From aa2d5b8c5f72e474c115441873582128a9199f85 Mon Sep 17 00:00:00 2001
From: Saurabh Nanda <saurabh@vacationlabs.com>
Date: Wed, 4 Sep 2024 12:55:57 +0530
Subject: [PATCH 1/3] Create hackernews_with_comments.recipe

---
 recipes/hackernews_with_comments.recipe | 149 ++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 recipes/hackernews_with_comments.recipe
diff --git a/recipes/hackernews_with_comments.recipe b/recipes/hackernews_with_comments.recipe
new file mode 100644
index 0000000000..951a65fd78
--- /dev/null
+++ b/recipes/hackernews_with_comments.recipe
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+'''
+Hacker News (with comments)
+'''
+from calibre.ptempfile import PersistentTemporaryFile
+from calibre.web.feeds.news import BasicNewsRecipe
+
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
+import re
+
+
+class HNWithComments(BasicNewsRecipe):
+    title = 'HN With actual comments'
+    __author__ = 'Tom Scholl & David Kerschner'
+    description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
+    publisher = 'Y Combinator'
+    category = 'news, programming, it, technology'
+    delay = 1
+    max_articles_per_feed = 20
+    oldest_article = 3
+    use_embedded_content = False
+    no_stylesheets = True
+    encoding = 'utf-8'
+    language = 'en'
+    requires_version = (0, 8, 16)
+
+    feeds = [
+        (u'Hacker News Frontpage', 'https://hnrss.org/frontpage'),
+        (u'Ask Hacker News', 'https://hnrss.org/ask')
+    ]
+
+    temp_files = []
+    articles_are_obfuscated = True
+
+    def get_readable_content(self, url):
+        self.log('get_readable_content(' + url + ')')
+        br = self.get_browser()
+        f = br.open(url)
+        html = f.read()
+        f.close()
+
+        return self.extract_readable_article(html, url)
+
+    def get_hn_content(self, url):
+        self.log('get_hn_content(' + url + ')')
+        soup = self.index_to_soup(url)
+        main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td
+
+        title_element = main.select('td.title .titleline a')[0]
+        self.log('title_element=' + repr(title_element))
+        title = self.tag_to_string(title_element)
+        self.log('title=' + title)
+        link = title_element['href']
+        # link = main.find('td', 'title').find('a')['href']
+        if link.startswith('item?'):
+            link = 'https://news.ycombinator.com/' + link
+        readable_link = link.rpartition('http://')[2].rpartition('https://')[2]
+        subtext = self.tag_to_string(main.find('td', 'subtext'))
+
+        title_content_td = main.find('td', 'title').findParent(
+            'tr').findNextSiblings('tr', limit=3)[2].findAll('td', limit=2)[1]
+        title_content = u''
+        if not title_content_td.find('form'):
+            title_content_td.name = 'div'
+            title_content = title_content_td.prettify()
+
+        comments = u''
+        for td in main.findAll('td', 'default'):
+            comhead = td.find('span', 'comhead')
+            if comhead:
+                com_title = u'<h4>' + \
+                    self.tag_to_string(comhead).replace(
+                        ' | link', '') + u'</h4>'
+                comhead.parent.extract()
+                br = td.find('br')
+                if br:
+                    br.extract()
+                reply = td.find('a', attrs={'href': re.compile('^reply?')})
+                if reply:
+                    reply.parent.extract()
+                td.name = 'div'
+                indent_width = (int(td.parent.find('td').img['width']) * 2) / 3
+                td['style'] = 'padding-left: ' + str(indent_width) + 'px'
+                comments = comments + com_title + td.prettify()
+
+        body = u'<h3>' + title + u'</h3><p><a href="' + link + u'">' + readable_link + \
+            u'</a><br/><strong>' + subtext + u'</strong></p>' + title_content + u'<br/>'
+        body = body + comments
+        return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'
+
+    def parse_feeds(self):
+        a = super(HNWithCommentsLinkAlt, self).parse_feeds()
+        self.hn_articles = a[0].articles
+        return a
+
+    def get_obfuscated_article(self, url):
+        self.log('get_obfuscated_article with url=' + url)
+        if url.startswith('https://news.ycombinator.com'):
+            content = self.get_hn_content(url)
+        else:
+            # TODO: use content-type header instead of url
+            is_image = False
+            for ext in ['.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp', ]:
+                if url.endswith(ext):
+                    is_image = True
+                    break
+
+            if is_image:
+                self.log('using image_content (' + url + ')')
+                content = u'<html><body><img src="' + url + u'"></body></html>'
+            else:
+                content = self.get_readable_content(url)
+
+            article = 0
+            for a in self.hn_articles:
+                if a.url == url:
+                    article = a
+
+        # content = re.sub(r'</body>\s*</html>\s*$', '', content) + \
+        #    article.summary + '</body></html>'
+
+        if not isinstance(content, bytes):
+            content = content.encode('utf-8')
+        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
+        self.temp_files[-1].write(content)
+        self.temp_files[-1].close()
+        return self.temp_files[-1].name
+
+    def is_link_wanted(self, url, tag):
+        if url.endswith('.pdf'):
+            return False
+        return True
+
+    def prettyify_url(self, url):
+        return urlparse(url).hostname
+
+    def populate_article_metadata(self, article, soup, first):
+        article.text_summary = self.prettyify_url(article.url)
+        article.summary = article.text_summary
+
+#    def parse_index(self):
+#        feeds = []
+#        feeds.append((u'Hacker News',[{'title': 'Testing', 'url': 'https://news.ycombinator.com/item?id=2935944'}]))
+#        return feeds

From bc96f06341412144f4e6f7317101beb336e55ae8 Mon Sep 17 00:00:00 2001
From: Saurabh Nanda <saurabh@vacationlabs.com>
Date: Wed, 4 Sep 2024 12:58:13 +0530
Subject: [PATCH 2/3] Delete recipes/hackernews_with_comments.recipe

---
 recipes/hackernews_with_comments.recipe | 149 ------------------------
 1 file changed, 149 deletions(-)
 delete mode 100644 recipes/hackernews_with_comments.recipe

diff --git a/recipes/hackernews_with_comments.recipe b/recipes/hackernews_with_comments.recipe
deleted file mode 100644
index 951a65fd78..0000000000
--- a/recipes/hackernews_with_comments.recipe
+++ /dev/null
@@ -1,149 +0,0 @@
-#!/usr/bin/env python
-
-__license__ = 'GPL v3'
-'''
-Hacker News (with comments)
-'''
-from calibre.ptempfile import PersistentTemporaryFile
-from calibre.web.feeds.news import BasicNewsRecipe
-
-try:
-    from urllib.parse import urlparse
-except ImportError:
-    from urlparse import urlparse
-import re
-
-
-class HNWithComments(BasicNewsRecipe):
-    title = 'HN With actual comments'
-    __author__ = 'Tom Scholl & David Kerschner'
-    description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
-    publisher = 'Y Combinator'
-    category = 'news, programming, it, technology'
-    delay = 1
-    max_articles_per_feed = 20
-    oldest_article = 3
-    use_embedded_content = False
-    no_stylesheets = True
-    encoding = 'utf-8'
-    language = 'en'
-    requires_version = (0, 8, 16)
-
-    feeds = [
-        (u'Hacker News Frontpage', 'https://hnrss.org/frontpage'),
-        (u'Ask Hacker News', 'https://hnrss.org/ask')
-    ]
-
-    temp_files = []
-    articles_are_obfuscated = True
-
-    def get_readable_content(self, url):
-        self.log('get_readable_content(' + url + ')')
-        br = self.get_browser()
-        f = br.open(url)
-        html = f.read()
-        f.close()
-
-        return self.extract_readable_article(html, url)
-
-    def get_hn_content(self, url):
-        self.log('get_hn_content(' + url + ')')
-        soup = self.index_to_soup(url)
-        main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td
-
-        title_element = main.select('td.title .titleline a')[0]
-        self.log('title_element=' + repr(title_element))
-        title = self.tag_to_string(title_element)
-        self.log('title=' + title)
-        link = title_element['href']
-        # link = main.find('td', 'title').find('a')['href']
-        if link.startswith('item?'):
-            link = 'https://news.ycombinator.com/' + link
-        readable_link = link.rpartition('http://')[2].rpartition('https://')[2]
-        subtext = self.tag_to_string(main.find('td', 'subtext'))
-
-        title_content_td = main.find('td', 'title').findParent(
-            'tr').findNextSiblings('tr', limit=3)[2].findAll('td', limit=2)[1]
-        title_content = u''
-        if not title_content_td.find('form'):
-            title_content_td.name = 'div'
-            title_content = title_content_td.prettify()
-
-        comments = u''
-        for td in main.findAll('td', 'default'):
-            comhead = td.find('span', 'comhead')
-            if comhead:
-                com_title = u'<h4>' + \
-                    self.tag_to_string(comhead).replace(
-                        ' | link', '') + u'</h4>'
-                comhead.parent.extract()
-                br = td.find('br')
-                if br:
-                    br.extract()
-                reply = td.find('a', attrs={'href': re.compile('^reply?')})
-                if reply:
-                    reply.parent.extract()
-                td.name = 'div'
-                indent_width = (int(td.parent.find('td').img['width']) * 2) / 3
-                td['style'] = 'padding-left: ' + str(indent_width) + 'px'
-                comments = comments + com_title + td.prettify()
-
-        body = u'<h3>' + title + u'</h3><p><a href="' + link + u'">' + readable_link + \
-            u'</a><br/><strong>' + subtext + u'</strong></p>' + title_content + u'<br/>'
-        body = body + comments
-        return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'
-
-    def parse_feeds(self):
-        a = super(HNWithCommentsLinkAlt, self).parse_feeds()
-        self.hn_articles = a[0].articles
-        return a
-
-    def get_obfuscated_article(self, url):
-        self.log('get_obfuscated_article with url=' + url)
-        if url.startswith('https://news.ycombinator.com'):
-            content = self.get_hn_content(url)
-        else:
-            # TODO: use content-type header instead of url
-            is_image = False
-            for ext in ['.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp', ]:
-                if url.endswith(ext):
-                    is_image = True
-                    break
-
-            if is_image:
-                self.log('using image_content (' + url + ')')
-                content = u'<html><body><img src="' + url + u'"></body></html>'
-            else:
-                content = self.get_readable_content(url)
-
-            article = 0
-            for a in self.hn_articles:
-                if a.url == url:
-                    article = a
-
-        # content = re.sub(r'</body>\s*</html>\s*$', '', content) + \
-        #    article.summary + '</body></html>'
-
-        if not isinstance(content, bytes):
-            content = content.encode('utf-8')
-        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
-        self.temp_files[-1].write(content)
-        self.temp_files[-1].close()
-        return self.temp_files[-1].name
-
-    def is_link_wanted(self, url, tag):
-        if url.endswith('.pdf'):
-            return False
-        return True
-
-    def prettyify_url(self, url):
-        return urlparse(url).hostname
-
-    def populate_article_metadata(self, article, soup, first):
-        article.text_summary = self.prettyify_url(article.url)
-        article.summary = article.text_summary
-
-#    def parse_index(self):
-#        feeds = []
-#        feeds.append((u'Hacker News',[{'title': 'Testing', 'url': 'https://news.ycombinator.com/item?id=2935944'}]))
-#        return feeds

From 889a03008ce3deb8527bf68f7fe7c4019cf64173 Mon Sep 17 00:00:00 2001
From: Saurabh Nanda <saurabh@vacationlabs.com>
Date: Wed, 4 Sep 2024 12:59:12 +0530
Subject: [PATCH 3/3] Bugfix in hackernews.recipe related to extracing comments

---
 recipes/hackernews.recipe | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/recipes/hackernews.recipe b/recipes/hackernews.recipe
index bae34b183b..e7bf7d690a 100644
--- a/recipes/hackernews.recipe
+++ b/recipes/hackernews.recipe
@@ -15,13 +15,14 @@ import re
 
 
 class HNWithCommentsLink(BasicNewsRecipe):
-    title = 'HN With Comments Link'
+    title = 'HN With Actual Comments'
     __author__ = 'Tom Scholl & David Kerschner'
     description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
     publisher = 'Y Combinator'
     category = 'news, programming, it, technology'
     delay = 1
-    max_articles_per_feed = 30
+    max_articles_per_feed = 20
+    oldest_article = 3
     use_embedded_content = False
     no_stylesheets = True
     encoding = 'utf-8'
@@ -29,7 +30,8 @@ class HNWithCommentsLink(BasicNewsRecipe):
     requires_version = (0, 8, 16)
 
     feeds = [
-        (u'Hacker News', 'https://news.ycombinator.com/rss')
+        (u'Hacker News Frontpage', 'https://hnrss.org/frontpage'),
+        (u'Ask Hacker News', 'https://hnrss.org/ask')
     ]
 
     temp_files = []
@@ -49,8 +51,10 @@ class HNWithCommentsLink(BasicNewsRecipe):
         soup = self.index_to_soup(url)
         main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td
 
-        title = self.tag_to_string(main.find('td', 'title'))
-        link = main.find('td', 'title').find('a')['href']
+        title_element = main.select('td.title .titleline a')[0]
+        title = self.tag_to_string(title_element)
+        link = title_element['href']
+        # link = main.find('td', 'title').find('a')['href']
         if link.startswith('item?'):
             link = 'https://news.ycombinator.com/' + link
         readable_link = link.rpartition('http://')[2].rpartition('https://')[2]
@@ -88,11 +92,12 @@ class HNWithCommentsLink(BasicNewsRecipe):
         return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'
 
     def parse_feeds(self):
-        a = super(HNWithCommentsLink, self).parse_feeds()
+        a = super(HNWithCommentsLinkAlt, self).parse_feeds()
         self.hn_articles = a[0].articles
         return a
 
     def get_obfuscated_article(self, url):
+        self.log('get_obfuscated_article with url=' + url)
         if url.startswith('https://news.ycombinator.com'):
             content = self.get_hn_content(url)
         else:
@@ -114,8 +119,8 @@ class HNWithCommentsLink(BasicNewsRecipe):
                 if a.url == url:
                     article = a
 
-        content = re.sub(r'</body>\s*</html>\s*$', '', content) + \
-            article.summary + '</body></html>'
+        # content = re.sub(r'</body>\s*</html>\s*$', '', content) + \
+        #    article.summary + '</body></html>'
 
         if not isinstance(content, bytes):
             content = content.encode('utf-8')