Fix #4474 (note of page in EPUB / picture in MOBI)

This commit is contained in:
Kovid Goyal 2010-01-10 15:40:13 -07:00
parent dbbed21599
commit 20fe1609e4
2 changed files with 150 additions and 145 deletions

View File

@@ -1,145 +1,145 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from datetime import date, timedelta from datetime import date, timedelta
class WaPoCartoonsRecipe(BasicNewsRecipe):
    """Fetch daily editorial cartoons syndicated via the Washington Post.

    Cartoons come from two back-ends: uclick/wpcomics pages (identified by a
    <select name="url"> archive drop-down) and creators.com pages (identified
    by a <select name="dest"> drop-down); each back-end gets its own candidate
    generator below.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 2

    title = u'Washington Post Cartoons'
    publisher = u'Washington Post'
    category = u'News, Cartoons'
    description = u'Cartoons from the Washington Post'

    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # One pseudo-feed per cartoonist; the URL is the artist's index page,
    # not an RSS feed (parse_index below scrapes the archive drop-down).
    feeds = [
        (u'Anderson', u'http://www.uclick.com/client/wpc/wpnan/'),
        (u'Auth', u'http://www.uclick.com/client/wpc/ta/'),
        (u'Bok', u'http://www.creators.com/featurepages/11_editorialcartoons_chip-bok.html?name=cb'),
        (u'Carlson', u'http://www.uclick.com/client/wpc/sc/'),
        (u'Luckovich', u'http://www.creators.com/featurepages/11_editorialcartoons_mike-luckovich.html?name=lk'),
        (u'McCoy', u'http://www.uclick.com/client/wpc/gm/'),
        (u'Pat Oliphant', u'http://www.uclick.com/client/wpc/po/'),
        (u'Sargent', u'http://wpcomics.washingtonpost.com/client/wpc/bs/'),
        (u'Wilkinson', u'http://www.uclick.com/client/wpc/wpswi/'),
    ]

    extra_css = '''
            body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
            h1 {font-size: medium; font-weight: bold; margin-bottom: -0.1em; padding: 0em; text-align: left;}
            #name {margin-bottom: 0.2em}
            #copyright {font-size: xx-small; color: #696969; text-align: right; margin-top: 0.2em;}
            '''

    def parse_index(self):
        """Build the article index: for each artist, the current cartoon plus
        every archived cartoon not older than ``oldest_article`` days.

        Returns a list of ``[feed_title, [cartoon_dict, ...]]`` entries in the
        format BasicNewsRecipe expects from parse_index.
        """
        index = []
        oldestDate = date.today() - timedelta(days=self.oldest_article)
        # Dates are compared as 'YYYYMMDD' strings, which orders the same
        # way as the dates themselves.
        oldest = oldestDate.strftime('%Y%m%d')
        for feed in self.feeds:
            cartoons = []
            soup = self.index_to_soup(feed[1])

            # The landing page itself always shows the newest cartoon.
            cartoon = {'title': 'Current', 'date': None, 'url': feed[1], 'description': ''}
            cartoons.append(cartoon)

            # The archive drop-down's name tells the two site layouts apart:
            # 'url' -> uclick/wpcomics, 'dest' -> creators.com.
            select = soup.find('select', attrs={'name': ['url', 'dest']})
            if select:
                if select['name'] == 'url':
                    cartoonCandidates = self.cartoonCandidatesWaPo(select, oldest)
                else:
                    cartoonCandidates = self.cartoonCandidatesCreatorsCom(select, oldest)
                for cartoon in cartoonCandidates:
                    cartoons.append(cartoon)

            index.append([feed[0], cartoons])

        return index

    def preprocess_html(self, soup):
        """Rebuild each cartoon page as a minimal document containing only the
        artist name, the cartoon image and the copyright line, with stable
        element ids ('name', 'comic_full', 'copyright') targeted by extra_css.
        """
        freshSoup = self.getFreshSoup(soup)

        div = soup.find('div', attrs={'id': 'name'})
        if div:
            # uclick/wpcomics layout: the ids we want already exist.
            freshSoup.body.append(div)
            comic = soup.find('div', attrs={'id': 'comic_full'})
            if comic:
                img = comic.find('img')
                if img and '&' in img['src']:
                    # Strip the trailing &-parameter from the image URL; it
                    # breaks image fetching in the generated book.
                    img['src'], sep, bad = img['src'].rpartition('&')
                freshSoup.body.append(comic)
            copyright = soup.find('div', attrs={'id': 'copyright'})
            if copyright:
                freshSoup.body.append(copyright)
        else:
            # creators.com layout: rename/relabel elements to match the
            # ids used by extra_css.
            span = soup.find('span', attrs={'class': 'title'})
            if span:
                del span['class']
                span['id'] = 'name'
                span.name = 'div'
                freshSoup.body.append(span)

            img = soup.find('img', attrs={'class': 'pic_big'})
            if img:
                td = img.parent
                if td.has_key('style'):
                    del td['style']
                td.name = 'div'
                td['id'] = 'comic_full'
                freshSoup.body.append(td)

            td = soup.find('td', attrs={'class': 'copy'})
            if td:
                # BUGFIX: was td.find('a'), which iterated the children of the
                # first <a> (and crashed when no <a> was present) instead of
                # removing every link from the copyright cell.
                for a in td.findAll('a'):
                    a.extract()
                del td['class']
                td['id'] = 'copyright'
                td.name = 'div'
                freshSoup.body.append(td)

        return freshSoup

    def getFreshSoup(self, oldSoup):
        """Return an empty HTML document, carrying over only the page title."""
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        if oldSoup.head.title:
            freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))
        return freshSoup

    def cartoonCandidatesWaPo(self, select, oldest):
        """Yield archive entries from a uclick/wpcomics <select name="url">.

        Option values end in .../YYYY/MM/DD; options are newest-first, so we
        stop at the first entry older than ``oldest`` ('YYYYMMDD' string).
        Option 0 is the currently shown cartoon and is skipped.
        """
        opts = select.findAll('option')
        for i in range(1, len(opts)):
            url = opts[i]['value'].rstrip('/')
            dateparts = url.split('/')[-3:]
            datenum = str(dateparts[0]) + str(dateparts[1]) + str(dateparts[2])
            if datenum >= oldest:
                yield {'title': self.tag_to_string(opts[i]), 'date': None, 'url': url, 'description': ''}
            else:
                return

    def cartoonCandidatesCreatorsCom(self, select, oldest):
        """Yield archive entries from a creators.com <select name="dest">.

        Option text reads like 'Mon January 4, 2010'; it is parsed into a
        'YYYYMMDD' string for the same newest-first cutoff as the WaPo
        variant. The 'selected' option is the cartoon already on the page.
        """
        monthNames = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
                      'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
                      'November': '11', 'December': '12'}
        opts = select.findAll('option')
        for i in range(1, len(opts)):
            if opts[i].has_key('selected'):
                continue
            dateString = self.tag_to_string(opts[i])
            rest, sep, year = dateString.rpartition(', ')
            parts = rest.split(' ')
            day = parts[2].rjust(2, '0')
            month = monthNames[parts[1]]
            datenum = str(year) + month + str(day)
            if datenum >= oldest:
                yield {'title': dateString, 'date': None, 'url': opts[i]['value'], 'description': ''}
            else:
                return

View File

@@ -264,6 +264,11 @@ class EPUBOutput(OutputFormatPlugin):
if body: if body:
body = body[0] body = body[0]
# Add id attribute to <a> tags that have name
for x in XPath('//h:a[@name]')(body):
if not x.get('id', False):
x.set('id', x.get('name'))
# Replace <br> that are children of <body> as ADE doesn't handle them # Replace <br> that are children of <body> as ADE doesn't handle them
if hasattr(body, 'xpath'): if hasattr(body, 'xpath'):
for br in XPath('./h:br')(body): for br in XPath('./h:br')(body):