From aa2d5b8c5f72e474c115441873582128a9199f85 Mon Sep 17 00:00:00 2001 From: Saurabh Nanda Date: Wed, 4 Sep 2024 12:55:57 +0530 Subject: [PATCH 1/3] Create hackernews_with_comments.recipe --- recipes/hackernews_with_comments.recipe | 149 ++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 recipes/hackernews_with_comments.recipe diff --git a/recipes/hackernews_with_comments.recipe b/recipes/hackernews_with_comments.recipe new file mode 100644 index 0000000000..951a65fd78 --- /dev/null +++ b/recipes/hackernews_with_comments.recipe @@ -0,0 +1,149 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +''' +Hacker News (with comments) +''' +from calibre.ptempfile import PersistentTemporaryFile +from calibre.web.feeds.news import BasicNewsRecipe + +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse +import re + + +class HNWithComments(BasicNewsRecipe): + title = 'HN With actual comments' + __author__ = 'Tom Scholl & David Kerschner' + description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.' + publisher = 'Y Combinator' + category = 'news, programming, it, technology' + delay = 1 + max_articles_per_feed = 20 + oldest_article = 3 + use_embedded_content = False + no_stylesheets = True + encoding = 'utf-8' + language = 'en' + requires_version = (0, 8, 16) + + feeds = [ + (u'Hacker News Frontpage', 'https://hnrss.org/frontpage'), + (u'Ask Hacker News', 'https://hnrss.org/ask') + ] + + temp_files = [] + articles_are_obfuscated = True + + def get_readable_content(self, url): + self.log('get_readable_content(' + url + ')') + br = self.get_browser() + f = br.open(url) + html = f.read() + f.close() + + return self.extract_readable_article(html, url) + + def get_hn_content(self, url): + self.log('get_hn_content(' + url + ')') + soup = self.index_to_soup(url) + main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td + + title_element = main.select('td.title .titleline a')[0] + self.log('title_element=' + repr(title_element)) + title = self.tag_to_string(title_element) + self.log('title=' + title) + link = title_element['href'] + # link = main.find('td', 'title').find('a')['href'] + if link.startswith('item?'): + link = 'https://news.ycombinator.com/' + link + readable_link = link.rpartition('http://')[2].rpartition('https://')[2] + subtext = self.tag_to_string(main.find('td', 'subtext')) + + title_content_td = main.find('td', 'title').findParent( + 'tr').findNextSiblings('tr', limit=3)[2].findAll('td', limit=2)[1] + title_content = u'' + if not title_content_td.find('form'): + title_content_td.name = 'div' + title_content = title_content_td.prettify() + + comments = u'' + for td in main.findAll('td', 'default'): + comhead = td.find('span', 'comhead') + if comhead: + com_title = u'

' + \ + self.tag_to_string(comhead).replace( + ' | link', '') + u'

' + comhead.parent.extract() + br = td.find('br') + if br: + br.extract() + reply = td.find('a', attrs={'href': re.compile('^reply?')}) + if reply: + reply.parent.extract() + td.name = 'div' + indent_width = (int(td.parent.find('td').img['width']) * 2) / 3 + td['style'] = 'padding-left: ' + str(indent_width) + 'px' + comments = comments + com_title + td.prettify() + + body = u'

' + title + u'

' + readable_link + \ + u'
' + subtext + u'

' + title_content + u'
' + body = body + comments + return u'' + title + u'' + body + '' + + def parse_feeds(self): + a = super(HNWithCommentsLinkAlt, self).parse_feeds() + self.hn_articles = a[0].articles + return a + + def get_obfuscated_article(self, url): + self.log('get_obfuscated_article with url=' + url) + if url.startswith('https://news.ycombinator.com'): + content = self.get_hn_content(url) + else: + # TODO: use content-type header instead of url + is_image = False + for ext in ['.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp', ]: + if url.endswith(ext): + is_image = True + break + + if is_image: + self.log('using image_content (' + url + ')') + content = u'' + else: + content = self.get_readable_content(url) + + article = 0 + for a in self.hn_articles: + if a.url == url: + article = a + + # content = re.sub(r'\s*\s*$', '', content) + \ + # article.summary + '' + + if not isinstance(content, bytes): + content = content.encode('utf-8') + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write(content) + self.temp_files[-1].close() + return self.temp_files[-1].name + + def is_link_wanted(self, url, tag): + if url.endswith('.pdf'): + return False + return True + + def prettyify_url(self, url): + return urlparse(url).hostname + + def populate_article_metadata(self, article, soup, first): + article.text_summary = self.prettyify_url(article.url) + article.summary = article.text_summary + +# def parse_index(self): +# feeds = [] +# feeds.append((u'Hacker News',[{'title': 'Testing', 'url': 'https://news.ycombinator.com/item?id=2935944'}])) +# return feeds From bc96f06341412144f4e6f7317101beb336e55ae8 Mon Sep 17 00:00:00 2001 From: Saurabh Nanda Date: Wed, 4 Sep 2024 12:58:13 +0530 Subject: [PATCH 2/3] Delete recipes/hackernews_with_comments.recipe --- recipes/hackernews_with_comments.recipe | 149 ------------------------ 1 file changed, 149 deletions(-) delete mode 100644 recipes/hackernews_with_comments.recipe diff --git a/recipes/hackernews_with_comments.recipe b/recipes/hackernews_with_comments.recipe deleted file mode 100644 index 951a65fd78..0000000000 --- a/recipes/hackernews_with_comments.recipe +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -''' -Hacker News (with comments) -''' -from calibre.ptempfile import PersistentTemporaryFile -from calibre.web.feeds.news import BasicNewsRecipe - -try: - from urllib.parse import urlparse -except ImportError: - from urlparse import urlparse -import re - - -class HNWithComments(BasicNewsRecipe): - title = 'HN With actual comments' - __author__ = 'Tom Scholl & David Kerschner' - description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.' - publisher = 'Y Combinator' - category = 'news, programming, it, technology' - delay = 1 - max_articles_per_feed = 20 - oldest_article = 3 - use_embedded_content = False - no_stylesheets = True - encoding = 'utf-8' - language = 'en' - requires_version = (0, 8, 16) - - feeds = [ - (u'Hacker News Frontpage', 'https://hnrss.org/frontpage'), - (u'Ask Hacker News', 'https://hnrss.org/ask') - ] - - temp_files = [] - articles_are_obfuscated = True - - def get_readable_content(self, url): - self.log('get_readable_content(' + url + ')') - br = self.get_browser() - f = br.open(url) - html = f.read() - f.close() - - return self.extract_readable_article(html, url) - - def get_hn_content(self, url): - self.log('get_hn_content(' + url + ')') - soup = self.index_to_soup(url) - main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td - - title_element = main.select('td.title .titleline a')[0] - self.log('title_element=' + repr(title_element)) - title = self.tag_to_string(title_element) - self.log('title=' + title) - link = title_element['href'] - # link = main.find('td', 'title').find('a')['href'] - if link.startswith('item?'): - link = 'https://news.ycombinator.com/' + link - readable_link = link.rpartition('http://')[2].rpartition('https://')[2] - subtext = self.tag_to_string(main.find('td', 'subtext')) - - title_content_td = main.find('td', 'title').findParent( - 'tr').findNextSiblings('tr', limit=3)[2].findAll('td', limit=2)[1] - title_content = u'' - if not title_content_td.find('form'): - title_content_td.name = 'div' - title_content = title_content_td.prettify() - - comments = u'' - for td in main.findAll('td', 'default'): - comhead = td.find('span', 'comhead') - if comhead: - com_title = u'

' + \ - self.tag_to_string(comhead).replace( - ' | link', '') + u'

' - comhead.parent.extract() - br = td.find('br') - if br: - br.extract() - reply = td.find('a', attrs={'href': re.compile('^reply?')}) - if reply: - reply.parent.extract() - td.name = 'div' - indent_width = (int(td.parent.find('td').img['width']) * 2) / 3 - td['style'] = 'padding-left: ' + str(indent_width) + 'px' - comments = comments + com_title + td.prettify() - - body = u'

' + title + u'

' + readable_link + \ - u'
' + subtext + u'

' + title_content + u'
' - body = body + comments - return u'' + title + u'' + body + '' - - def parse_feeds(self): - a = super(HNWithCommentsLinkAlt, self).parse_feeds() - self.hn_articles = a[0].articles - return a - - def get_obfuscated_article(self, url): - self.log('get_obfuscated_article with url=' + url) - if url.startswith('https://news.ycombinator.com'): - content = self.get_hn_content(url) - else: - # TODO: use content-type header instead of url - is_image = False - for ext in ['.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp', ]: - if url.endswith(ext): - is_image = True - break - - if is_image: - self.log('using image_content (' + url + ')') - content = u'' - else: - content = self.get_readable_content(url) - - article = 0 - for a in self.hn_articles: - if a.url == url: - article = a - - # content = re.sub(r'\s*\s*$', '', content) + \ - # article.summary + '' - - if not isinstance(content, bytes): - content = content.encode('utf-8') - self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write(content) - self.temp_files[-1].close() - return self.temp_files[-1].name - - def is_link_wanted(self, url, tag): - if url.endswith('.pdf'): - return False - return True - - def prettyify_url(self, url): - return urlparse(url).hostname - - def populate_article_metadata(self, article, soup, first): - article.text_summary = self.prettyify_url(article.url) - article.summary = article.text_summary - -# def parse_index(self): -# feeds = [] -# feeds.append((u'Hacker News',[{'title': 'Testing', 'url': 'https://news.ycombinator.com/item?id=2935944'}])) -# return feeds From 889a03008ce3deb8527bf68f7fe7c4019cf64173 Mon Sep 17 00:00:00 2001 From: Saurabh Nanda Date: Wed, 4 Sep 2024 12:59:12 +0530 Subject: [PATCH 3/3] Bugfix in hackernews.recipe related to extracing comments --- recipes/hackernews.recipe | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/recipes/hackernews.recipe b/recipes/hackernews.recipe index bae34b183b..e7bf7d690a 100644 --- a/recipes/hackernews.recipe +++ b/recipes/hackernews.recipe @@ -15,13 +15,14 @@ import re class HNWithCommentsLink(BasicNewsRecipe): - title = 'HN With Comments Link' + title = 'HN With Actual Comments' __author__ = 'Tom Scholl & David Kerschner' description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.' publisher = 'Y Combinator' category = 'news, programming, it, technology' delay = 1 - max_articles_per_feed = 30 + max_articles_per_feed = 20 + oldest_article = 3 use_embedded_content = False no_stylesheets = True encoding = 'utf-8' @@ -29,7 +30,8 @@ class HNWithCommentsLink(BasicNewsRecipe): requires_version = (0, 8, 16) feeds = [ - (u'Hacker News', 'https://news.ycombinator.com/rss') + (u'Hacker News Frontpage', 'https://hnrss.org/frontpage'), + (u'Ask Hacker News', 'https://hnrss.org/ask') ] temp_files = [] @@ -49,8 +51,10 @@ class HNWithCommentsLink(BasicNewsRecipe): soup = self.index_to_soup(url) main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td - title = self.tag_to_string(main.find('td', 'title')) - link = main.find('td', 'title').find('a')['href'] + title_element = main.select('td.title .titleline a')[0] + title = self.tag_to_string(title_element) + link = title_element['href'] + # link = main.find('td', 'title').find('a')['href'] if link.startswith('item?'): link = 'https://news.ycombinator.com/' + link readable_link = link.rpartition('http://')[2].rpartition('https://')[2] @@ -88,11 +92,12 @@ class HNWithCommentsLink(BasicNewsRecipe): return u'' + title + u'' + body + '' def parse_feeds(self): - a = super(HNWithCommentsLink, self).parse_feeds() + a = super(HNWithCommentsLinkAlt, self).parse_feeds() self.hn_articles = a[0].articles return a def get_obfuscated_article(self, url): + self.log('get_obfuscated_article with url=' + url) if url.startswith('https://news.ycombinator.com'): content = self.get_hn_content(url) else: @@ -114,8 +119,8 @@ class HNWithCommentsLink(BasicNewsRecipe): if a.url == url: article = a - content = re.sub(r'\s*\s*$', '', content) + \ - article.summary + '' + # content = re.sub(r'\s*\s*$', '', content) + \ + # article.summary + '' if not isinstance(content, bytes): content = content.encode('utf-8')