Allow recipes to specify overrides for conversion options

2025-07-31 14:33:54 -04:00 · 2009-05-13 19:20:47 -07:00 · 2009-05-13 19:20:47 -07:00 · 2e0ad5d1e0
commit 2e0ad5d1e0
parent 9b170e6c95
5 changed files with 74 additions and 68 deletions
--- a/src/calibre/manual/news_recipe.rst
+++ b/src/calibre/manual/news_recipe.rst
@@ -54,6 +54,8 @@ Customizing e-book download
 .. automember:: BasicNewsRecipe.timefmt
 .. automember:: BasicNewsRecipe.conversion_options
 .. automember:: BasicNewsRecipe.feeds
 .. automember:: BasicNewsRecipe.no_stylesheets
--- a/src/calibre/web/feeds/input.py
+++ b/src/calibre/web/feeds/input.py
@@ -57,6 +57,8 @@ class RecipeInput(InputFormatPlugin):
        ro = recipe(opts, log, self.report_progress)
        ro.download()
        for key, val in recipe.conversion_options.items():
            setattr(opts, key, val)
        opts.output_profile.flow_size = 0
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -156,13 +156,16 @@ class BasicNewsRecipe(Recipe):
    #: :attr:`BasicNewsRecipe.filter_regexps` should be defined.
    filter_regexps        = []
-    #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
+    #: Recipe specific options to control the conversion of the downloaded
-    html2lrf_options      = []
+    #: content into an e-book. These will override any user or plugin specified
-
+    #: values, so only use if absolutely necessary. For example:
-    #: Options to pass to html2epub to customize generation of EPUB ebooks.
+    #: conversion_options = {
-    html2epub_options     = ''
+    #:   'base_font_size' : 16,
-    #: Options to pass to oeb2mobi to customize generation of MOBI ebooks.
+    #:   'tags' : 'mytag1,mytag2',
-    oeb2mobi_options     = ''
+    #:   'title' : 'My Title',
    #:   'linearize_tables' : True,
    #: }
    conversion_options = {}
    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
    #: A tag is specified as a dictionary of the form::
--- a/src/calibre/web/feeds/recipes/recipe_barrons.py
+++ b/src/calibre/web/feeds/recipes/recipe_barrons.py
@@ -1,76 +1,76 @@
 ##
-##    web2lrf profile to download articles from Barrons.com 
+##    web2lrf profile to download articles from Barrons.com
-##    can download subscriber-only content if username and  
+##    can download subscriber-only content if username and
 ##    password are supplied.
 ##
-''' 
+'''
-''' 
+'''
- 
+
-import re 
+import re
- 
+
-from calibre.web.feeds.news import BasicNewsRecipe  
+from calibre.web.feeds.news import BasicNewsRecipe
-         
+
-class Barrons(BasicNewsRecipe): 
+class Barrons(BasicNewsRecipe):
-    
+
-        title = 'Barron\'s' 
+        title = 'Barron\'s'
        max_articles_per_feed = 50
        needs_subscription    = True
        language = _('English')
        __author__ = 'Kovid Goyal'
        description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
-        timefmt  = ' [%a, %b %d, %Y]' 
+        timefmt  = ' [%a, %b %d, %Y]'
-        use_embedded_content   = False 
+        use_embedded_content   = False
        no_stylesheets = False
        match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
-        html2lrf_options = [('--ignore-tables'),('--base-font-size=10')]
+        conversion_options = {'linearize_tables': True}
        ##delay = 1
-        
+
-        ## Don't grab articles more than 7 days old 
+        ## Don't grab articles more than 7 days old
-        oldest_article = 7 
+        oldest_article = 7
-        preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  
+        preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-                [ 
+                [
-                ## Remove anything before the body of the article. 
+                ## Remove anything before the body of the article.
-                (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'), 
+                (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
-                ## Remove any insets from the body of the article. 
-                (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'), 
-                ## Remove any reprint info from the body of the article. 
+                ## Remove any insets from the body of the article.
-                (r'<hr size.*?<p', lambda match : '<p'), 
+                (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
-                ## Remove anything after the end of the article. 
+                ## Remove any reprint info from the body of the article.
-                (r'<!-- article end.*?</body>', lambda match : '</body>'), 
+                (r'<hr size.*?<p', lambda match : '<p'),
-                ] 
+
-        ] 
+                ## Remove anything after the end of the article.
- 
+                (r'<!-- article end.*?</body>', lambda match : '</body>'),
-        def get_browser(self): 
+                ]
-            br = BasicNewsRecipe.get_browser() 
+        ]
-            if self.username is not None and self.password is not None: 
+
-                br.open('http://commerce.barrons.com/auth/login') 
+        def get_browser(self):
-                br.select_form(name='login_form') 
+            br = BasicNewsRecipe.get_browser()
-                br['user']   = self.username 
+            if self.username is not None and self.password is not None:
-                br['password'] = self.password 
+                br.open('http://commerce.barrons.com/auth/login')
-                br.submit() 
+                br.select_form(name='login_form')
-            return br 
+                br['user']   = self.username
- 
+                br['password'] = self.password
-## Use the print version of a page when available. 
+                br.submit()
- 
+            return br
-        def print_version(self, url): 
+
-                return url.replace('/article/', '/article_print/') 
+## Use the print version of a page when available.
- 
+
-## Comment out the feeds you don't want retrieved. 
+        def print_version(self, url):
-## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire 
+                return url.replace('/article/', '/article_print/')
- 
+
-        def get_feeds(self): 
+## Comment out the feeds you don't want retrieved.
-                return  [ 
+## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
-                ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'), 
+
-                ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'), 
+        def get_feeds(self):
-                ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'), 
+                return  [
-                ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'), 
+                ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
-                ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'), 
+                ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
-                ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'), 
+                ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
                ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
                ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
                ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
                ]
        ## Logout of website
--- a/src/calibre/web/feeds/recipes/recipe_winsupersite.py
+++ b/src/calibre/web/feeds/recipes/recipe_winsupersite.py
@@ -13,8 +13,7 @@ class Winsupersite(BasicNewsRecipe):
    no_stylesheets        = True
    use_embedded_content  = False
    remove_javascript     = True
-    html2lrf_options = ['--ignore-tables']
+    conversion_options = {'linearize_tables' : True}
-    html2epub_options = 'linearize_tables = True'
    remove_tags_before = dict(name='h1')
    preprocess_regexps = [
   (re.compile(r'<p>--Paul Thurrott.*</body>', re.DOTALL|re.IGNORECASE),
@@ -24,5 +23,5 @@ class Winsupersite(BasicNewsRecipe):
        br = BasicNewsRecipe.get_browser()
        br.open('http://www.winsupersite.com')
        return br
-    
+
    feeds          = [(u'Supersite for Windows', u'http://www.winsupersite.com/supersite.xml')]