Allow recipes to specify overrides for conversion options

This commit is contained in:
Kovid Goyal 2009-05-13 19:20:47 -07:00
parent 9b170e6c95
commit 2e0ad5d1e0
5 changed files with 74 additions and 68 deletions

View File

@ -54,6 +54,8 @@ Customizing e-book download
.. automember:: BasicNewsRecipe.timefmt .. automember:: BasicNewsRecipe.timefmt
.. automember:: BasicNewsRecipe.conversion_options
.. automember:: BasicNewsRecipe.feeds .. automember:: BasicNewsRecipe.feeds
.. automember:: BasicNewsRecipe.no_stylesheets .. automember:: BasicNewsRecipe.no_stylesheets

View File

@ -57,6 +57,8 @@ class RecipeInput(InputFormatPlugin):
ro = recipe(opts, log, self.report_progress) ro = recipe(opts, log, self.report_progress)
ro.download() ro.download()
for key, val in recipe.conversion_options.items():
setattr(opts, key, val)
opts.output_profile.flow_size = 0 opts.output_profile.flow_size = 0

View File

@ -156,13 +156,16 @@ class BasicNewsRecipe(Recipe):
#: :attr:`BasicNewsRecipe.filter_regexps` should be defined. #: :attr:`BasicNewsRecipe.filter_regexps` should be defined.
filter_regexps = [] filter_regexps = []
#: List of options to pass to html2lrf, to customize generation of LRF ebooks. #: Recipe specific options to control the conversion of the downloaded
html2lrf_options = [] #: content into an e-book. These will override any user or plugin specified
#: values, so only use if absolutely necessary. For example::
#:
#: Options to pass to html2epub to customize generation of EPUB ebooks. #: conversion_options = {
html2epub_options = '' #: 'base_font_size' : 16,
#: Options to pass to oeb2mobi to customize generation of MOBI ebooks. #: 'tags' : 'mytag1,mytag2',
oeb2mobi_options = '' #: 'title' : 'My Title',
#: 'linearize_tables' : True,
#: }
conversion_options = {}
#: List of tags to be removed. Specified tags are removed from downloaded HTML. #: List of tags to be removed. Specified tags are removed from downloaded HTML.
#: A tag is specified as a dictionary of the form:: #: A tag is specified as a dictionary of the form::

View File

@ -1,76 +1,76 @@
## ##
## web2lrf profile to download articles from Barrons.com ## web2lrf profile to download articles from Barrons.com
## can download subscriber-only content if username and ## can download subscriber-only content if username and
## password are supplied. ## password are supplied.
## ##
''' '''
''' '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Barrons(BasicNewsRecipe): class Barrons(BasicNewsRecipe):
title = 'Barron\'s' title = 'Barron\'s'
max_articles_per_feed = 50 max_articles_per_feed = 50
needs_subscription = True needs_subscription = True
language = _('English') language = _('English')
__author__ = 'Kovid Goyal' __author__ = 'Kovid Goyal'
description = 'Weekly publication for investors from the publisher of the Wall Street Journal' description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
timefmt = ' [%a, %b %d, %Y]' timefmt = ' [%a, %b %d, %Y]'
use_embedded_content = False use_embedded_content = False
no_stylesheets = False no_stylesheets = False
match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*'] match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
html2lrf_options = [('--ignore-tables'),('--base-font-size=10')] conversion_options = {'linearize_tables': True}
##delay = 1 ##delay = 1
## Don't grab articles more than 7 days old ## Don't grab articles more than 7 days old
oldest_article = 7 oldest_article = 7
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[ [
## Remove anything before the body of the article. ## Remove anything before the body of the article.
(r'<body.*?<!-- article start', lambda match: '<body><!-- article start'), (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
## Remove any insets from the body of the article.
(r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
## Remove any reprint info from the body of the article. ## Remove any insets from the body of the article.
(r'<hr size.*?<p', lambda match : '<p'), (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
## Remove anything after the end of the article. ## Remove any reprint info from the body of the article.
(r'<!-- article end.*?</body>', lambda match : '</body>'), (r'<hr size.*?<p', lambda match : '<p'),
]
] ## Remove anything after the end of the article.
(r'<!-- article end.*?</body>', lambda match : '</body>'),
def get_browser(self): ]
br = BasicNewsRecipe.get_browser() ]
if self.username is not None and self.password is not None:
br.open('http://commerce.barrons.com/auth/login') def get_browser(self):
br.select_form(name='login_form') br = BasicNewsRecipe.get_browser()
br['user'] = self.username if self.username is not None and self.password is not None:
br['password'] = self.password br.open('http://commerce.barrons.com/auth/login')
br.submit() br.select_form(name='login_form')
return br br['user'] = self.username
br['password'] = self.password
## Use the print version of a page when available. br.submit()
return br
def print_version(self, url):
return url.replace('/article/', '/article_print/') ## Use the print version of a page when available.
## Comment out the feeds you don't want retrieved. def print_version(self, url):
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire return url.replace('/article/', '/article_print/')
def get_feeds(self): ## Comment out the feeds you don't want retrieved.
return [ ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'), def get_feeds(self):
('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'), return [
('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'), ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'), ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'), ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
] ]
## Logout of website ## Logout of website

View File

@ -13,8 +13,7 @@ class Winsupersite(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
remove_javascript = True remove_javascript = True
html2lrf_options = ['--ignore-tables'] conversion_options = {'linearize_tables' : True}
html2epub_options = 'linearize_tables = True'
remove_tags_before = dict(name='h1') remove_tags_before = dict(name='h1')
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'<p>--Paul Thurrott.*</body>', re.DOTALL|re.IGNORECASE), (re.compile(r'<p>--Paul Thurrott.*</body>', re.DOTALL|re.IGNORECASE),
@ -24,5 +23,5 @@ class Winsupersite(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
br.open('http://www.winsupersite.com') br.open('http://www.winsupersite.com')
return br return br
feeds = [(u'Supersite for Windows', u'http://www.winsupersite.com/supersite.xml')] feeds = [(u'Supersite for Windows', u'http://www.winsupersite.com/supersite.xml')]