Allow recipes to specify overrides for conversion options

2026-05-28 09:42:33 -04:00 · 2009-05-13 19:20:47 -07:00
parent 9b170e6c95
commit 2e0ad5d1e0
5 changed files with 74 additions and 68 deletions
@@ -54,6 +54,8 @@ Customizing e-book download

 .. automember:: BasicNewsRecipe.timefmt

+.. automember:: BasicNewsRecipe.conversion_options
+
 .. automember:: BasicNewsRecipe.feeds

 .. automember:: BasicNewsRecipe.no_stylesheets
@@ -57,6 +57,8 @@ class RecipeInput(InputFormatPlugin):

        ro = recipe(opts, log, self.report_progress)
        ro.download()
+        for key, val in recipe.conversion_options.items():
+            setattr(opts, key, val)

        opts.output_profile.flow_size = 0

@@ -156,13 +156,16 @@ class BasicNewsRecipe(Recipe):
    #: :attr:`BasicNewsRecipe.filter_regexps` should be defined.
    filter_regexps        = []

-    #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
-    html2lrf_options      = []
-
-    #: Options to pass to html2epub to customize generation of EPUB ebooks.
-    html2epub_options     = ''
-    #: Options to pass to oeb2mobi to customize generation of MOBI ebooks.
-    oeb2mobi_options     = ''
+    #: Recipe specific options to control the conversion of the downloaded
+    #: content into an e-book. These will override any user or plugin specified
+    #: values, so only use if absolutely necessary. For example:
+    #: conversion_options = {
+    #:   'base_font_size' : 16,
+    #:   'tags' : 'mytag1,mytag2',
+    #:   'title' : 'My Title',
+    #:   'linearize_tables' : True,
+    #: }
+    conversion_options = {}

    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
    #: A tag is specified as a dictionary of the form::
@@ -1,76 +1,76 @@
 ##
-##    web2lrf profile to download articles from Barrons.com 
-##    can download subscriber-only content if username and  
+##    web2lrf profile to download articles from Barrons.com
+##    can download subscriber-only content if username and
 ##    password are supplied.
 ##
-''' 
-''' 
- 
-import re 
- 
-from calibre.web.feeds.news import BasicNewsRecipe  
-         
-class Barrons(BasicNewsRecipe): 
-    
-        title = 'Barron\'s' 
+'''
+'''
+
+import re
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Barrons(BasicNewsRecipe):
+
+        title = 'Barron\'s'
        max_articles_per_feed = 50
        needs_subscription    = True
        language = _('English')
        __author__ = 'Kovid Goyal'
        description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
-        timefmt  = ' [%a, %b %d, %Y]' 
-        use_embedded_content   = False 
+        timefmt  = ' [%a, %b %d, %Y]'
+        use_embedded_content   = False
        no_stylesheets = False
        match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
-        html2lrf_options = [('--ignore-tables'),('--base-font-size=10')]
+        conversion_options = {'linearize_tables': True}
        ##delay = 1
-        
-        ## Don't grab articles more than 7 days old 
-        oldest_article = 7 
+
+        ## Don't grab articles more than 7 days old
+        oldest_article = 7


-        preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  
-                [ 
-                ## Remove anything before the body of the article. 
-                (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'), 
- 
-                ## Remove any insets from the body of the article. 
-                (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'), 
+        preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+                [
+                ## Remove anything before the body of the article.
+                (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),

-                ## Remove any reprint info from the body of the article. 
-                (r'<hr size.*?<p', lambda match : '<p'), 
+                ## Remove any insets from the body of the article.
+                (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),

-                ## Remove anything after the end of the article. 
-                (r'<!-- article end.*?</body>', lambda match : '</body>'), 
-                ] 
-        ] 
- 
-        def get_browser(self): 
-            br = BasicNewsRecipe.get_browser() 
-            if self.username is not None and self.password is not None: 
-                br.open('http://commerce.barrons.com/auth/login') 
-                br.select_form(name='login_form') 
-                br['user']   = self.username 
-                br['password'] = self.password 
-                br.submit() 
-            return br 
- 
-## Use the print version of a page when available. 
- 
-        def print_version(self, url): 
-                return url.replace('/article/', '/article_print/') 
- 
-## Comment out the feeds you don't want retrieved. 
-## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire 
- 
-        def get_feeds(self): 
-                return  [ 
-                ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'), 
-                ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'), 
-                ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'), 
-                ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'), 
-                ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'), 
-                ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'), 
+                ## Remove any reprint info from the body of the article.
+                (r'<hr size.*?<p', lambda match : '<p'),
+
+                ## Remove anything after the end of the article.
+                (r'<!-- article end.*?</body>', lambda match : '</body>'),
+                ]
+        ]
+
+        def get_browser(self):
+            br = BasicNewsRecipe.get_browser()
+            if self.username is not None and self.password is not None:
+                br.open('http://commerce.barrons.com/auth/login')
+                br.select_form(name='login_form')
+                br['user']   = self.username
+                br['password'] = self.password
+                br.submit()
+            return br
+
+## Use the print version of a page when available.
+
+        def print_version(self, url):
+                return url.replace('/article/', '/article_print/')
+
+## Comment out the feeds you don't want retrieved.
+## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
+
+        def get_feeds(self):
+                return  [
+                ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
+                ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
+                ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
+                ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
+                ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
+                ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
                ]

        ## Logout of website
@@ -13,8 +13,7 @@ class Winsupersite(BasicNewsRecipe):
    no_stylesheets        = True
    use_embedded_content  = False
    remove_javascript     = True
-    html2lrf_options = ['--ignore-tables']
-    html2epub_options = 'linearize_tables = True'
+    conversion_options = {'linearize_tables' : True}
    remove_tags_before = dict(name='h1')
    preprocess_regexps = [
   (re.compile(r'<p>--Paul Thurrott.*</body>', re.DOTALL|re.IGNORECASE),
@@ -24,5 +23,5 @@ class Winsupersite(BasicNewsRecipe):
        br = BasicNewsRecipe.get_browser()
        br.open('http://www.winsupersite.com')
        return br
-    
+
    feeds          = [(u'Supersite for Windows', u'http://www.winsupersite.com/supersite.xml')]