feeds2lrf

This commit is contained in:
Kovid Goyal 2008-03-15 01:43:20 +00:00
parent fb53f18a8f
commit 00b0cf46fc
11 changed files with 206 additions and 70 deletions

View File

@ -18,7 +18,7 @@ __docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
__appname__ = 'libprs500'
import sys, os, logging, mechanize, locale, cStringIO, re, subprocess, textwrap
import sys, os, logging, mechanize, locale, copy, cStringIO, re, subprocess, textwrap
from gettext import GNUTranslations
from math import floor
from optparse import OptionParser as _OptionParser
@ -143,38 +143,64 @@ class OptionParser(_OptionParser):
raise Exception(msg)
_OptionParser.error(self, msg)
def merge(self, parser):
    '''
    Add options from parser to self. In case of conflicts, conflicting options from
    parser are skipped.

    Options are deep-copied before being added, so the option objects owned by
    C{parser} are not re-parented. Every option group of C{parser} is recreated
    on self with the same title and description.

    @param parser: The parser whose options and option groups are merged into self.
    '''
    def conflicts(opt):
        # An option may carry several short/long strings; a clash on any one
        # of them would make add_option() fail, so test them all.
        return any(self.has_option(s) for s in opt._short_opts + opt._long_opts)

    def merge_options(options, container):
        for opt in copy.deepcopy(options):
            if not conflicts(opt):
                container.add_option(opt)

    # Snapshot both lists before anything is added to self.
    opts = list(parser.option_list)
    groups = list(parser.option_groups)

    merge_options(opts, self)

    for group in groups:
        # Carry the description over so the merged group keeps its help text.
        g = self.add_option_group(group.title, group.description)
        merge_options(group.option_list, g)
def subsume(self, group_name, msg=''):
    '''
    Move all existing options into a subgroup named
    C{group_name} with description C{msg}.

    The C{--help} and C{--version} options stay where they are. Existing
    option groups are discarded; the options they held end up in the new
    subgroup.

    @param group_name: Title of the subgroup that receives the options.
    @param msg: Description of the subgroup.
    '''
    # Take a full snapshot first: the loop below mutates the very containers
    # that options_iter() walks.
    opts = [opt for opt in self.options_iter()
            if opt.get_opt_string() not in ('--version', '--help')]
    self.option_groups = []
    subgroup = self.add_option_group(group_name, msg)
    for opt in opts:
        # The option must be unregistered from the parser before it can be
        # re-added, otherwise add_option() reports a conflict with itself.
        self.remove_option(opt.get_opt_string())
        subgroup.add_option(opt)
def options_iter(self):
    '''
    Iterate over every option of this parser: first the top level options,
    then the options of each option group, in order. Options whose string
    representation is blank are not yielded.
    '''
    def named(options):
        # Lazily filter out options that render as an empty string.
        for candidate in options:
            if str(candidate).strip():
                yield candidate

    for option in named(self.option_list):
        yield option
    for group in self.option_groups:
        for option in named(group.option_list):
            yield option
def option_by_dest(self, dest):
    '''
    Return the first option (as produced by C{options_iter}) whose C{dest}
    attribute equals C{dest}, or C{None} if there is no such option.
    '''
    matching = (candidate for candidate in self.options_iter()
                if candidate.dest == dest)
    for found in matching:
        return found
    return None
def merge_options(self, lower, upper):
    '''
    Merge options in lower and upper option lists into upper.
    Default values in upper are overridden by
    non default values in lower.

    Only destinations present in both objects are considered. A destination
    for which this parser has no option is left untouched, because there is
    no default to compare against.

    @param lower: Option values object supplying the overriding values.
    @param upper: Option values object that is updated in place.
    '''
    lower_vals, upper_vals = lower.__dict__, upper.__dict__
    for dest in list(lower_vals):
        if dest not in upper_vals:
            continue
        opt = self.option_by_dest(dest)
        if opt is None:
            # e.g. an attribute assigned by hand rather than parsed
            continue
        if lower_vals[dest] != opt.default and upper_vals[dest] == opt.default:
            upper_vals[dest] = lower_vals[dest]
def load_library(name, cdll):

View File

@ -0,0 +1,16 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

View File

@ -0,0 +1,71 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Convert web feeds to LRF files.
'''
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500.web.feeds.main import option_parser as feeds_option_parser
from libprs500.web.feeds.main import run_recipe
from libprs500.ptempfile import PersistentTemporaryDirectory
from libprs500 import sanitize_file_name
import sys, os, time
def option_parser():
    '''
    Build the command line parser for feeds2lrf.

    Starts from the feeds parser (without C{--output-dir} and C{--lrf}), moves
    its options into a 'FEEDS2DISK OPTIONS' group and then merges in the
    html2lrf options, themselves grouped under 'HTML2LRF OPTIONS'.
    '''
    feeds_parser = feeds_option_parser()
    for unwanted in ('--output-dir', '--lrf'):
        feeds_parser.remove_option(unwanted)
    feeds_parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk'))

    html2lrf_parser = lrf_option_parser('')
    html2lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf'))

    feeds_parser.merge(html2lrf_parser)
    return feeds_parser
def main(args=sys.argv, notification=None, handler=None):
    '''
    Download a recipe (or a list of feeds) into a temporary directory and
    convert the resulting index.html into an LRF/LRS file.

    @param args: Command line arguments, program name first.
    @param notification: Progress callback, passed through to C{run_recipe}.
    @param handler: Logging handler, passed through to C{run_recipe}.
    @return: 0 on success, 1 if the arguments were invalid and usage was printed.
    @raise RuntimeError: If no readable index.html exists after running the recipe.
    '''
    parser = option_parser()
    opts, args = parser.parse_args(args)
    opts.lrf = True

    if len(args) != 2 and opts.feeds is None:
        parser.print_help()
        return 1

    recipe_arg = args[1] if len(args) > 1 else None

    tdir = PersistentTemporaryDirectory('_feeds2lrf')
    opts.output_dir = tdir

    recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler)

    htmlfile = os.path.join(tdir, 'index.html')
    if not os.access(htmlfile, os.R_OK):
        # recipe_arg is None when only --feeds was supplied; concatenating it
        # directly would raise a TypeError and hide the real failure.
        source = recipe_arg if recipe_arg is not None else opts.feeds
        raise RuntimeError(_('Fetching of recipe failed: ')+str(source))

    # Options requested by the recipe itself only fill in values the user
    # left at their defaults.
    lparser = lrf_option_parser('')
    ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0]
    parser.merge_options(ropts, opts)

    if not opts.output:
        ext = '.lrs' if opts.lrs else '.lrf'
        fname = recipe.title + time.strftime(recipe.timefmt)+ext
        opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname))
    print('Generating LRF...')
    process_file(htmlfile, opts)
    return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -1715,7 +1715,7 @@ def process_file(path, options, logger=None):
tpath = ''
try_opf(path, options, logger)
if options.cover:
if getattr(options, 'cover', None):
options.cover = os.path.expanduser(options.cover)
if not os.path.isabs(options.cover):
options.cover = os.path.join(dirpath, options.cover)
@ -1750,7 +1750,7 @@ def process_file(path, options, logger=None):
options.title = default_title
for prop in ('author', 'author_sort', 'title', 'title_sort', 'publisher', 'freetext'):
val = getattr(options, prop)
val = getattr(options, prop, None)
if val and not isinstance(val, unicode):
soup = BeautifulSoup(val)
setattr(options, prop, unicode(soup))
@ -1822,13 +1822,14 @@ def try_opf(path, options, logger):
break
if opf is None:
return
dirpath = os.path.dirname(os.path.abspath(opf))
opf = OPFReader(open(opf, 'rb'), dirpath)
try:
title = opf.title
if title and not options.title:
if title and not getattr(options, 'title', None):
options.title = title
if options.author == 'Unknown':
if getattr(options, 'author', 'Unknown') == 'Unknown':
if opf.authors:
options.author = ', '.join(opf.authors)
if opf.author_sort:
@ -1837,12 +1838,12 @@ def try_opf(path, options, logger):
publisher = opf.publisher
if publisher:
options.publisher = publisher
if not options.category:
if not getattr(options, 'category', None):
category = opf.category
if category:
options.category = category
if not options.cover or options.use_metadata_cover:
orig_cover = options.cover
if not getattr(options, 'cover', None) or options.use_metadata_cover:
orig_cover = getattr(options, 'cover', None)
options.cover = None
cover = opf.cover
if cover:
@ -1865,10 +1866,10 @@ def try_opf(path, options, logger):
break
except:
continue
if not options.cover and orig_cover is not None:
if not getattr(options, 'cover', None) and orig_cover is not None:
options.cover = orig_cover
options.spine = [i.href for i in opf.spine.items()]
if not hasattr(options, 'toc') or options.toc is None:
if not getattr(options, 'toc', None):
options.toc = opf.toc
except Exception:
logger.exception('Failed to process opf file')

View File

@ -503,7 +503,7 @@ class OPFReader(OPF):
stream.close()
self.manifest = Manifest(self.soup, dir)
self.spine = Spine(self.soup, self.manifest)
self.toc = TOC()
self.toc = TOC(base_path=dir)
self.toc.read_from_opf(self)
self.cover_data = (None, None)
@ -554,12 +554,14 @@ class OPFCreator(MetaInformation):
self.manifest = rentries
def create_manifest_from_files_in(self, files_and_dirs):
#self.base_path = os.path.commonprefix(files_and_dirs)
entries = []
def dodir(dir):
for root, dirs, files in os.walk(dir):
for name in files:
path = os.path.join(root, name)
if os.path.isfile(path):
entries.append((path, None))
for i in files_and_dirs:

View File

@ -166,6 +166,7 @@ def setup_completion(fatal_errors):
from libprs500.ebooks.mobi.reader import option_parser as mobioeb
from libprs500.web.feeds.main import option_parser as feeds2disk
from libprs500.web.feeds.recipes import titles as feed_titles
from libprs500.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
f = open_file('/etc/bash_completion.d/libprs500')
@ -191,6 +192,7 @@ def setup_completion(fatal_errors):
f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
f.write('''
_prs500_ls()
{

View File

@ -79,7 +79,7 @@ class Feed(object):
self.oldest_article = oldest_article
for item in entries:
if len(self.articles) > max_articles_per_feed:
if len(self.articles) >= max_articles_per_feed:
break
self.parse_article(item)

View File

@ -41,7 +41,7 @@ Available builtin recipes are:
p.remove_option('--base-dir')
p.remove_option('--verbose')
p.remove_option('--max-files')
p.subsume('WEB2DISK OPTIONS', 'Options to control web2disk (used to fetch websites linked from feeds)')
p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)'))
p.add_option('--feeds', default=None,
help=_('''Specify a list of feeds to download. For example:
@ -50,7 +50,7 @@ If you specify this option, any argument to %prog is ignored and a default recip
p.add_option('--verbose', default=False, action='store_true',
help=_('''Be more verbose while processing.'''))
p.add_option('--title', default=None,
help='The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.')
help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
p.add_option('--username', default=None, help=_('Username for sites that require a login to access content.'))
p.add_option('--password', default=None, help=_('Password for sites that require a login to access content.'))
p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
@ -61,7 +61,9 @@ If you specify this option, any argument to %prog is ignored and a default recip
p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false',
help=_('Dont show the progress bar'))
p.add_option('--debug', action='store_true', default=False,
help='Very verbose output, useful for debugging.')
help=_('Very verbose output, useful for debugging.'))
p.add_option('--test', action='store_true', default=False,
help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))
return p
@ -72,10 +74,10 @@ def simple_progress_bar(percent, msg):
def no_progress_bar(percent, msg):
    '''
    Plain text progress callback: print C{msg}; C{percent} is ignored.
    '''
    print msg
def main(args=sys.argv, notification=None, handler=None):
p = option_parser()
opts, args = p.parse_args(args)
class RecipeError(Exception):
    '''
    Raised when the recipe argument cannot be resolved to a usable recipe.
    '''
def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
if notification is None:
from libprs500.terminfo import TerminalController, ProgressBar
term = TerminalController(sys.stdout)
@ -89,18 +91,15 @@ def main(args=sys.argv, notification=None, handler=None):
else:
notification = no_progress_bar
if len(args) != 2 and opts.feeds is None:
p.print_help()
return 1
recipe = None
if opts.feeds is not None:
recipe = BasicNewsRecipe
else:
try:
if os.access(args[1], os.R_OK):
if os.access(recipe_arg, os.R_OK):
try:
recipe = compile_recipe(open(args[1]).read())
recipe = compile_recipe(open(recipe_arg).read())
except:
import traceback
traceback.print_exc()
@ -108,15 +107,13 @@ def main(args=sys.argv, notification=None, handler=None):
else:
raise Exception('not file')
except:
recipe = get_builtin_recipe(args[1])
recipe = get_builtin_recipe(recipe_arg)
if recipe is None:
recipe = compile_recipe(args[1])
recipe = compile_recipe(recipe_arg)
if recipe is None:
p.print_help()
print
print args[1], 'is an invalid recipe'
return 1
raise RecipeError(recipe_arg+ ' is an invalid recipe')
if handler is None:
from libprs500 import ColoredFormatter
@ -125,9 +122,23 @@ def main(args=sys.argv, notification=None, handler=None):
handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
logging.getLogger('feeds2disk').addHandler(handler)
recipe = recipe(opts, p, notification)
recipe = recipe(opts, parser, notification)
if not os.path.exists(recipe.output_dir):
os.makedirs(recipe.output_dir)
recipe.download()
return recipe
def main(args=sys.argv, notification=None, handler=None):
    '''
    Command line entry point: parse C{args} and run the requested recipe.

    @param args: Command line arguments, program name first.
    @param notification: Progress callback, passed through to C{run_recipe}.
    @param handler: Logging handler, passed through to C{run_recipe}.
    @return: 0 after the recipe has run, 1 if usage was printed because
             neither a single recipe argument nor C{--feeds} was given.
    '''
    parser = option_parser()
    opts, args = parser.parse_args(args)

    if opts.feeds is None and len(args) != 2:
        parser.print_help()
        return 1

    if len(args) > 1:
        recipe_arg = args[1]
    else:
        recipe_arg = None

    run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler)
    return 0
if __name__ == '__main__':

View File

@ -165,6 +165,8 @@ class BasicNewsRecipe(object):
'''
if not self.feeds:
raise NotImplementedError
if self.test:
return self.feeds[:2]
return self.feeds
@classmethod
@ -225,9 +227,12 @@ class BasicNewsRecipe(object):
@param parser: Command line option parser. Used to intelligently merge options.
@param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
'''
for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug'):
for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'):
setattr(self, attr, getattr(options, attr))
self.output_dir = os.path.abspath(self.output_dir)
if options.test:
self.max_articles_per_feed = 2
self.simultaneous_downloads = min(4, self.simultaneous_downloads)
self.logger = logging.getLogger('feeds2disk')
@ -288,11 +293,13 @@ class BasicNewsRecipe(object):
self.simultaneous_downloads = 1
self.navbar = templates.NavBarTemplate()
self.max_articles_per_feed -= 1
self.html2lrf_options.append('--use-spine')
self.failed_downloads = []
self.partial_failures = []
def _postprocess_html(self, soup):
if self.extra_css is not None:
head = soup.find('head')
@ -383,6 +390,8 @@ class BasicNewsRecipe(object):
def build_index(self):
self.report_progress(0, _('Fetching feeds...'))
feeds = self.parse_feeds()
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
index = os.path.join(self.output_dir, 'index.html')
@ -460,13 +469,14 @@ class BasicNewsRecipe(object):
if dir is None:
dir = self.output_dir
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
mi.author_sort = __appname__
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
manifest = ['feed_%d'%i for i in range(len(feeds))]
manifest.append('index.html')
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
cpath = getattr(self, 'cover_path', None)
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath

View File

@ -41,7 +41,7 @@ class Newsweek(BasicNewsRecipe):
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
]
extra_css = '#content { font:serif 1.2em; }'
extra_css = '#content { font:serif 12pt; }\n.story {font:12pt}\n.HorizontalHeader {font:18pt}\n.deck {font:16pt}'
keep_only_tags = [dict(name='div', id='content')]
remove_tags = [
@ -54,11 +54,6 @@ class Newsweek(BasicNewsRecipe):
recursions = 1
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
# For testing
#feeds = feeds[3:5]
#max_articles_per_feed = 2
def postprocess_html(self, soup):
divs = list(soup.findAll('div', 'pagination'))

View File

@ -37,9 +37,11 @@ def basename(url):
return res
def save_soup(soup, target):
nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
meta = soup.find('meta', content=True)
if meta and 'charset' in meta['content']:
ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
nm = ns.find('meta')
metas = soup.findAll('meta', content=True)
for meta in metas:
if 'charset' in meta['content']:
meta.replaceWith(nm)
f = codecs.open(target, 'w', 'utf-8')
f.write(unicode(soup))