Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-08 10:44:09 -04:00
Commit f08750b33c: Merge branch 'master' of https://github.com/unkn0w7n/calibre
Binary file not shown. Before: 759 B
Binary file not shown. Before: 301 B, After: 416 B
@ -1,30 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class NYTimesGlobal(BasicNewsRecipe):
-    title = u'NY Times Global'
-    language = 'en'
-    __author__ = 'Krittika Goyal'
-    oldest_article = 1  # days
-    max_articles_per_feed = 25
-    use_embedded_content = False
-
-    no_stylesheets = True
-    auto_cleanup = True
-
-    feeds = [
-        ('NYTimes',
-         'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'),
-        ('NYTimes global',
-         'http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml'),
-        ('World',
-         'http://www.nytimes.com/services/xml/rss/nyt/World.xml'),
-        ('U.S.',
-         'http://www.nytimes.com/services/xml/rss/nyt/US.xml'),
-        ('Business',
-         'http://feeds.nytimes.com/nyt/rss/Business'),
-        ('Sports',
-         'http://www.nytimes.com/services/xml/rss/nyt/Sports.xml'),
-        ('Technology',
-         'http://feeds.nytimes.com/nyt/rss/Technology'),
-    ]
@ -1,9 +1,8 @@
 #!/usr/bin/env python
 import json
 import re
-import time
-from datetime import datetime, timedelta
 
+from calibre.utils.iso8601 import parse_iso8601
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
@ -66,7 +65,7 @@ def parse_byline(byl):
     yield '</i></b></div>'
 
 def iso_date(x):
-    dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
+    dt = parse_iso8601(x, as_utc=False)
     return dt.strftime('%b %d, %Y at %I:%M %p')
 
 def parse_header(h):
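A small sketch of what the simplified iso_date now does (the timestamp below is made up): calibre's parse_iso8601 from calibre.utils.iso8601 parses the ISO 8601 string and, with as_utc=False, returns a datetime in local time, replacing the manual time.timezone arithmetic of the old line.

    from calibre.utils.iso8601 import parse_iso8601

    def iso_date(x):
        dt = parse_iso8601(x, as_utc=False)
        return dt.strftime('%b %d, %Y at %I:%M %p')

    print(iso_date('2025-07-08T14:44:09Z'))  # e.g. 'Jul 08, 2025 at 10:44 AM' on a UTC-4 machine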
@ -138,7 +137,7 @@ def parse_types(x):
     elif x.get('__typename', '') == 'Image':
         yield ''.join(parse_image(x))
     elif x.get('__typename', '') == 'ImageBlock':
-        yield ''.join(parse_image(x['media']))
+        yield ''.join(parse_types(x['media']))
     elif x.get('__typename', '') == 'GridBlock':
         yield ''.join(parse_img_grid(x))
 
@ -265,6 +264,8 @@ class nytFeeds(BasicNewsRecipe):
         'https://rss.nytimes.com/services/xml/rss/nyt/tmagazine.xml',
         'https://rss.nytimes.com/services/xml/rss/nyt/books.xml',
         'https://www.nytimes.com/services/xml/rss/nyt/Travel.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/well.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml',
         'http://nytimes.com/timeswire/feeds/'
     ]
 
@ -301,5 +302,6 @@ class nytFeeds(BasicNewsRecipe):
     def get_article_url(self, article):
         url = BasicNewsRecipe.get_article_url(self, article)
         # you can remove '|/espanol/' from code below to include spanish articles.
-        if not re.search(r'/video/|/live/|/athletic/|/espanol/', url):
+        if not re.search(r'/video/|/live/|/athletic/|/espanol/|/card/', url):
             return url
+        self.log('\tSkipped URL: ', url)
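For illustration, a minimal sketch (the URLs are made up) of how the widened filter in nytFeeds.get_article_url behaves: any URL matching one of the listed path fragments, now including /card/, is skipped and logged instead of returned.

    import re

    SKIP = r'/video/|/live/|/athletic/|/espanol/|/card/'
    urls = [
        'https://www.nytimes.com/2025/07/08/world/europe/example-story.html',  # hypothetical: kept
        'https://www.nytimes.com/card/2025/07/08/example-briefing',            # hypothetical: skipped
    ]
    for url in urls:
        print(url, '-> kept' if not re.search(SKIP, url) else '-> skipped')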
@ -14,9 +14,7 @@ from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
 
-is_web_edition = True
-oldest_web_edition_article = 7  # days
-use_wayback_machine = True
+use_wayback_machine = False
 
 
 # The sections to download when downloading the web edition, comment out
@ -77,22 +75,28 @@ def new_tag(soup, name, attrs=()):
 
 
 class NewYorkTimes(BasicNewsRecipe):
-    if is_web_edition:
-        title = 'The New York Times (Web)'
-        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
-    else:
-        title = 'The New York Times'
-        description = 'Today\'s New York Times'
+    title = 'The New York Times (Web)'
+    description = (
+        'New York Times (Web). You can edit the recipe to remove sections you are not interested in. '
+        'Use advanced menu to make changes to fetch Todays Paper'
+    )
     encoding = 'utf-8'
     __author__ = 'Kovid Goyal'
-    language = 'en'
+    language = 'en_US'
     ignore_duplicate_articles = {'title', 'url'}
     no_stylesheets = True
-    compress_news_images = True
-    compress_news_images_auto_size = 5
-    conversion_options = {'flow_size': 0}
-    delay = 0 if use_wayback_machine else 1
+    is_web_edition = True
+    oldest_web_edition_article = 7  # days
+
+    extra_css = '''
+        .byl, .time { font-size:small; color:#202020; }
+        .cap { font-size:small; text-align:center; }
+        .cred { font-style:italic; font-size:small; }
+        em, blockquote { color: #202020; }
+        .sc { font-variant: small-caps; }
+        .lbl { font-size:small; color:#404040; }
+        img { display:block; margin:0 auto; }
+    '''
 
     @property
     def nyt_parser(self):
@ -106,9 +110,13 @@ class NewYorkTimes(BasicNewsRecipe):
         if use_wayback_machine and not skip_wayback:
             from calibre import browser
             return self.nyt_parser.download_url(url, browser())
-        return self.browser.open_novisit(url).read()
+        return self.index_to_soup(url, raw=True)
 
     def preprocess_raw_html(self, raw_html, url):
+        if '/interactive/' in url:
+            return '<html><body><p><em>'\
+                + 'This is an interactive article, which is supposed to be read in a browser.'\
+                + '</p></em></body></html>'
         html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
         return html
 
@ -121,9 +129,51 @@ class NewYorkTimes(BasicNewsRecipe):
             tf.write(self.get_nyt_page(url))
             return tf.name
 
+    recipe_specific_options = {
+        'web': {
+            'short': 'Type in yes, if you want Todays Paper',
+            'default': 'Web Edition'
+        },
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 1, gives you articles from the past 24 hours\n(Works only for Web_Edition)',
+            'default': str(oldest_web_edition_article)
+        },
+        'date': {
+            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of NYT newspaper',
+            'long': 'For example, 2024/07/16'
+        },
+        'res': {
+            'short': 'For hi-res images, select a resolution from the following\noptions: popup, jumbo, mobileMasterAt3x, superJumbo',
+            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use articleInline.',
+        },
+        'comp': {
+            'short': 'Compress News Images?',
+            'long': 'enter yes',
+            'default': 'no'
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        c = self.recipe_specific_options.get('comp')
+        d = self.recipe_specific_options.get('days')
+        w = self.recipe_specific_options.get('web')
+        if w and isinstance(w, str):
+            if w == 'yes':
+                self.is_web_edition = False
+        if d and isinstance(d, str):
+            self.oldest_web_edition_article = float(d)
+        if c and isinstance(c, str):
+            if c.lower() == 'yes':
+                self.compress_news_images = True
+
     def read_todays_paper(self):
         INDEX = 'https://www.nytimes.com/section/todayspaper'
         # INDEX = 'file:///t/raw.html'
+        d = self.recipe_specific_options.get('date')
+        if d and isinstance(d, str):
+            INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
         return self.index_to_soup(self.get_nyt_page(INDEX, skip_wayback=True))
 
     def read_nyt_metadata(self):
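As a quick illustration of the new 'date' option, using the example value the option text itself gives (2024/07/16), read_todays_paper now rewrites the index URL for a past edition roughly like this:

    d = '2024/07/16'  # hypothetical user input for the 'date' option
    INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
    # INDEX -> 'https://www.nytimes.com/issue/todayspaper/2024/07/16/todays-new-york-times'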
@ -219,7 +269,7 @@ class NewYorkTimes(BasicNewsRecipe):
             date = format_date(d)
             today = datetime.date.today()
             delta = today - d
-            if delta.days > oldest_web_edition_article:
+            if delta.days > self.oldest_web_edition_article:
                 self.log.debug('\tSkipping article', title, 'as it is too old')
                 continue
             yield {'title': title, 'url': url, 'description': desc, 'date': date}
@ -242,7 +292,7 @@ class NewYorkTimes(BasicNewsRecipe):
             date = format_date(d)
             today = datetime.date.today()
             delta = today - d
-            if delta.days > oldest_web_edition_article:
+            if delta.days > self.oldest_web_edition_article:
                 self.log.debug('\tSkipping article', title, 'as it is too old')
                 continue
             yield {'title': title, 'url': url, 'description': desc, 'date': date}
@ -290,6 +340,34 @@ class NewYorkTimes(BasicNewsRecipe):
         # return [('All articles', [
         # {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
         # ])]
-        if is_web_edition:
+        if self.is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
+
+    def get_browser(self, *args, **kwargs):
+        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.addheaders += [
+            ('Referer', 'https://www.google.com/'),
+            ('X-Forwarded-For', '66.249.66.1')
+        ]
+        return br
+
+    def preprocess_html(self, soup):
+        w = self.recipe_specific_options.get('res')
+        if w and isinstance(w, str):
+            res = '-' + w
+            for img in soup.findAll('img', attrs={'src':True}):
+                if '-article' in img['src']:
+                    ext = img['src'].split('?')[0].split('.')[-1]
+                    img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
+        for c in soup.findAll('div', attrs={'class':'cap'}):
+            for p in c.findAll(['p', 'div']):
+                p.name = 'span'
+        return soup
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not re.search(r'/video/|/athletic/|/card/', url):
+            return url
+        self.log('\tSkipping ', url)
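To make the new image-resolution option concrete, here is a small sketch (the static01.nyt.com URL is made up) of the rewrite preprocess_html applies when the 'res' option is set to, say, superJumbo:

    src = 'https://static01.nyt.com/images/2024/07/16/example-articleLarge.jpg?quality=75&auto=webp'  # hypothetical
    res = '-superJumbo'
    ext = src.split('?')[0].split('.')[-1]                # 'jpg'
    src = src.rsplit('-article', 1)[0] + res + '.' + ext
    # src -> 'https://static01.nyt.com/images/2024/07/16/example-superJumbo.jpg'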
@ -14,8 +14,6 @@ from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
 
-is_web_edition = False
-oldest_web_edition_article = 7  # days
 use_wayback_machine = False
 
 
@ -77,18 +75,18 @@ def new_tag(soup, name, attrs=()):
 
 
 class NewYorkTimes(BasicNewsRecipe):
-    if is_web_edition:
-        title = 'The New York Times (Web)'
-        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
-    else:
-        title = 'The New York Times'
-        description = 'Today\'s New York Times'
+    title = 'The New York Times'
+    description = (
+        'New York Times. Todays Paper '
+        'Use advanced menu to make changes to fetch Web Edition'
+    )
     encoding = 'utf-8'
     __author__ = 'Kovid Goyal'
     language = 'en_US'
     ignore_duplicate_articles = {'title', 'url'}
     no_stylesheets = True
+    is_web_edition = False
+    oldest_web_edition_article = 7  # days
+
     extra_css = '''
         .byl, .time { font-size:small; color:#202020; }
@ -132,8 +130,17 @@ class NewYorkTimes(BasicNewsRecipe):
             return tf.name
 
     recipe_specific_options = {
+        'web': {
+            'short': 'Type in yes, if you want Web Edition',
+            'default': 'Todays Paper'
+        },
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 1, gives you articles from the past 24 hours\n(Works only for Web_Edition)',
+            'default': str(oldest_web_edition_article)
+        },
         'date': {
-            'short': 'The date of the edition to download (YYYY/MM/DD format)',
+            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of NYT newspaper',
             'long': 'For example, 2024/07/16'
         },
         'res': {
@ -150,6 +157,13 @@ class NewYorkTimes(BasicNewsRecipe):
     def __init__(self, *args, **kwargs):
         BasicNewsRecipe.__init__(self, *args, **kwargs)
         c = self.recipe_specific_options.get('comp')
+        d = self.recipe_specific_options.get('days')
+        w = self.recipe_specific_options.get('web')
+        if w and isinstance(w, str):
+            if w == 'yes':
+                self.is_web_edition = True
+        if d and isinstance(d, str):
+            self.oldest_web_edition_article = float(d)
         if c and isinstance(c, str):
             if c.lower() == 'yes':
                 self.compress_news_images = True
@ -255,7 +269,7 @@ class NewYorkTimes(BasicNewsRecipe):
             date = format_date(d)
             today = datetime.date.today()
             delta = today - d
-            if delta.days > oldest_web_edition_article:
+            if delta.days > self.oldest_web_edition_article:
                 self.log.debug('\tSkipping article', title, 'as it is too old')
                 continue
             yield {'title': title, 'url': url, 'description': desc, 'date': date}
@ -278,7 +292,7 @@ class NewYorkTimes(BasicNewsRecipe):
             date = format_date(d)
             today = datetime.date.today()
             delta = today - d
-            if delta.days > oldest_web_edition_article:
+            if delta.days > self.oldest_web_edition_article:
                 self.log.debug('\tSkipping article', title, 'as it is too old')
                 continue
             yield {'title': title, 'url': url, 'description': desc, 'date': date}
@ -326,7 +340,7 @@ class NewYorkTimes(BasicNewsRecipe):
         # return [('All articles', [
         # {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
         # ])]
-        if is_web_edition:
+        if self.is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
 
@ -351,3 +365,8 @@ class NewYorkTimes(BasicNewsRecipe):
             for p in c.findAll(['p', 'div']):
                 p.name = 'span'
         return soup
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not re.search(r'/video/|/athletic/', url):
+            return url
@ -1876,6 +1876,8 @@ class BasicNewsRecipe(Recipe):
             if articles:
                 arelpath = sorted(articles, key=numeric_sort_key)[0]
                 a.set('href', item.relhref(arelpath))
+                if a.text and len(a) == 0:
+                    a.text = a.text + '`'
                 if url not in seen:
                     log.debug(f'Resolved internal URL: {url} -> {arelpath}')
                     seen.add(url)
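For clarity on this last hunk: the anchor appears to be an lxml element (a.set, a.text, len(a)), and len(a) == 0 means it has no child elements, so only plain-text anchors get the trailing '`' marker once their internal URL is resolved. A minimal standalone sketch (the anchor markup is made up):

    from lxml import etree

    a = etree.fromstring('<a href="article_1/index.html">Related coverage</a>')  # hypothetical resolved link
    if a.text and len(a) == 0:      # has text, no child elements
        a.text = a.text + '`'
    print(etree.tostring(a, encoding='unicode'))
    # <a href="article_1/index.html">Related coverage`</a>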