feeds2lrf

Kovid Goyal 2008-03-15 01:43:20 +00:00
parent fb53f18a8f
commit 00b0cf46fc
11 changed files with 206 additions and 70 deletions

View File

@@ -18,7 +18,7 @@ __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'
-import sys, os, logging, mechanize, locale, cStringIO, re, subprocess, textwrap
+import sys, os, logging, mechanize, locale, copy, cStringIO, re, subprocess, textwrap
 from gettext import GNUTranslations
 from math import floor
 from optparse import OptionParser as _OptionParser
@@ -143,38 +143,64 @@ class OptionParser(_OptionParser):
             raise Exception(msg)
         _OptionParser.error(self, msg)
 
+    def merge(self, parser):
+        '''
+        Add options from parser to self. In case of conflicts, conflicting options from
+        parser are skipped.
+        '''
+        opts = list(parser.option_list)
+        groups = list(parser.option_groups)
+
+        def merge_options(options, container):
+            for opt in copy.deepcopy(options):
+                if not self.has_option(opt.get_opt_string()):
+                    container.add_option(opt)
+
+        merge_options(opts, self)
+        for group in groups:
+            g = self.add_option_group(group.title)
+            merge_options(group.option_list, g)
+
     def subsume(self, group_name, msg=''):
         '''
         Move all existing options into a subgroup named
         C{group_name} with description C{msg}.
         '''
-        opts = list(self.option_list)
-        groups = list(self.option_groups)
-        exclude = []
-        for opt in opts:
-            ops = opt.get_opt_string()
-            if ops in ('--help', '--version'):
-                exclude.append(opt)
-            else:
-                self.remove_option(ops)
-        for group in groups:
-            for opt in group.option_list:
-                opts.append(opt)
-                group.remove_option(opt)
+        opts = [opt for opt in self.options_iter() if opt.get_opt_string() not in ('--version', '--help')]
         self.option_groups = []
         subgroup = self.add_option_group(group_name, msg)
         for opt in opts:
-            if opt in exclude:
-                continue
+            self.remove_option(opt.get_opt_string())
             subgroup.add_option(opt)
+
+    def options_iter(self):
+        for opt in self.option_list:
+            if str(opt).strip():
+                yield opt
+        for gr in self.option_groups:
+            for opt in gr.option_list:
+                if str(opt).strip():
+                    yield opt
+
+    def option_by_dest(self, dest):
+        for opt in self.options_iter():
+            if opt.dest == dest:
+                return opt
+
+    def merge_options(self, lower, upper):
+        '''
+        Merge options in lower and upper option lists into upper.
+        Default values in upper are overridden by
+        non-default values in lower.
+        '''
+        for dest in lower.__dict__.keys():
+            if not upper.__dict__.has_key(dest):
+                continue
+            opt = self.option_by_dest(dest)
+            if lower.__dict__[dest] != opt.default and \
+               upper.__dict__[dest] == opt.default:
+                upper.__dict__[dest] = lower.__dict__[dest]
+
 def load_library(name, cdll):
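
The merge()/subsume()/merge_options() additions above are what let two command line tools share one parser: subsume() folds a parser's existing options into a named group, merge() copies another parser's options and groups in (skipping conflicts), and merge_options() copies non-default values from one parsed result into another. A rough usage sketch, assuming OptionParser is imported from the libprs500 package as elsewhere in this commit; the option names here are only illustrative:

    # Sketch: composing two OptionParsers the way feeds2lrf (below) does.
    from libprs500 import OptionParser

    feeds = OptionParser(usage='%prog [options] recipe')
    feeds.add_option('--username', default=None)
    feeds.subsume('FEEDS2DISK OPTIONS')   # move existing options into a named group

    lrf = OptionParser(usage='%prog [options] file.html')
    lrf.add_option('--output', default=None)
    lrf.subsume('HTML2LRF OPTIONS')

    feeds.merge(lrf)                      # copy lrf's groups/options, skipping conflicts
    opts, args = feeds.parse_args([])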

View File

@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

View File

@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Convert web feeds to LRF files.
+'''
+
+from libprs500.ebooks.lrf import option_parser as lrf_option_parser
+from libprs500.ebooks.lrf.html.convert_from import process_file
+from libprs500.web.feeds.main import option_parser as feeds_option_parser
+from libprs500.web.feeds.main import run_recipe
+from libprs500.ptempfile import PersistentTemporaryDirectory
+from libprs500 import sanitize_file_name
+
+import sys, os, time
+
+def option_parser():
+    parser = feeds_option_parser()
+    parser.remove_option('--output-dir')
+    parser.remove_option('--lrf')
+    parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk'))
+    lrf_parser = lrf_option_parser('')
+    lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf'))
+    parser.merge(lrf_parser)
+    return parser
+
+def main(args=sys.argv, notification=None, handler=None):
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    opts.lrf = True
+    if len(args) != 2 and opts.feeds is None:
+        parser.print_help()
+        return 1
+    recipe_arg = args[1] if len(args) > 1 else None
+
+    tdir = PersistentTemporaryDirectory('_feeds2lrf')
+    opts.output_dir = tdir
+
+    recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler)
+
+    htmlfile = os.path.join(tdir, 'index.html')
+    if not os.access(htmlfile, os.R_OK):
+        raise RuntimeError(_('Fetching of recipe failed: ')+recipe_arg)
+
+    lparser = lrf_option_parser('')
+    ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0]
+    parser.merge_options(ropts, opts)
+
+    if not opts.output:
+        ext = '.lrs' if opts.lrs else '.lrf'
+        fname = recipe.title + time.strftime(recipe.timefmt)+ext
+        opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname))
+
+    print 'Generating LRF...'
+    process_file(htmlfile, opts)
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
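
This new entry point simply chains the two existing tools: run_recipe() (the feeds2disk machinery) downloads the recipe into a PersistentTemporaryDirectory, and html2lrf's process_file() then converts the generated index.html into an LRF, with recipe.html2lrf_options merged into the command line options. A hedged sketch of driving it from Python; 'Newsweek' is only an example of a builtin recipe title:

    # Sketch: invoking feeds2lrf programmatically instead of from the shell.
    import sys
    from libprs500.ebooks.lrf.feeds.convert_from import main as feeds2lrf_main

    # main() expects sys.argv-style arguments: argv[0] is the program name,
    # argv[1] a builtin recipe title or the path to a recipe file.
    sys.exit(feeds2lrf_main(['feeds2lrf', 'Newsweek']))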

View File

@@ -1715,7 +1715,7 @@ def process_file(path, options, logger=None):
         tpath = ''
     try_opf(path, options, logger)
-    if options.cover:
+    if getattr(options, 'cover', None):
         options.cover = os.path.expanduser(options.cover)
         if not os.path.isabs(options.cover):
             options.cover = os.path.join(dirpath, options.cover)
@@ -1750,7 +1750,7 @@ def process_file(path, options, logger=None):
             options.title = default_title
     for prop in ('author', 'author_sort', 'title', 'title_sort', 'publisher', 'freetext'):
-        val = getattr(options, prop)
+        val = getattr(options, prop, None)
         if val and not isinstance(val, unicode):
             soup = BeautifulSoup(val)
             setattr(options, prop, unicode(soup))
@@ -1822,13 +1822,14 @@ def try_opf(path, options, logger):
                 break
     if opf is None:
         return
+
     dirpath = os.path.dirname(os.path.abspath(opf))
     opf = OPFReader(open(opf, 'rb'), dirpath)
     try:
         title = opf.title
-        if title and not options.title:
+        if title and not getattr(options, 'title', None):
             options.title = title
-        if options.author == 'Unknown':
+        if getattr(options, 'author', 'Unknown') == 'Unknown':
             if opf.authors:
                 options.author = ', '.join(opf.authors)
             if opf.author_sort:
@@ -1837,12 +1838,12 @@ def try_opf(path, options, logger):
         publisher = opf.publisher
         if publisher:
             options.publisher = publisher
-        if not options.category:
+        if not getattr(options, 'category', None):
             category = opf.category
             if category:
                 options.category = category
-        if not options.cover or options.use_metadata_cover:
-            orig_cover = options.cover
+        if not getattr(options, 'cover', None) or options.use_metadata_cover:
+            orig_cover = getattr(options, 'cover', None)
             options.cover = None
             cover = opf.cover
             if cover:
@@ -1865,10 +1866,10 @@ def try_opf(path, options, logger):
                         break
                 except:
                     continue
-        if not options.cover and orig_cover is not None:
+        if not getattr(options, 'cover', None) and orig_cover is not None:
             options.cover = orig_cover
         options.spine = [i.href for i in opf.spine.items()]
-        if not hasattr(options, 'toc') or options.toc is None:
+        if not getattr(options, 'toc', None):
             options.toc = opf.toc
     except Exception:
         logger.exception('Failed to process opf file')
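
All of these hunks follow one pattern: process_file() and try_opf() now read optional attributes with getattr(options, name, default) instead of direct attribute access, because the options object may now come from the merged feeds2lrf parser and need not define every html2lrf-specific attribute. A minimal illustration of the difference; Opts is just a stand-in for the optparse values object:

    # Illustration: why getattr with a default is used above.
    class Opts(object):
        title = 'Example'

    options = Opts()
    print getattr(options, 'title', None)   # -> 'Example'
    print getattr(options, 'cover', None)   # -> None, no AttributeError
    # print options.cover                   # would raise AttributeError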

View File

@@ -503,7 +503,7 @@ class OPFReader(OPF):
             stream.close()
         self.manifest = Manifest(self.soup, dir)
         self.spine = Spine(self.soup, self.manifest)
-        self.toc = TOC()
+        self.toc = TOC(base_path=dir)
         self.toc.read_from_opf(self)
         self.cover_data = (None, None)
@@ -554,12 +554,14 @@ class OPFCreator(MetaInformation):
         self.manifest = rentries
 
     def create_manifest_from_files_in(self, files_and_dirs):
+        #self.base_path = os.path.commonprefix(files_and_dirs)
         entries = []
 
        def dodir(dir):
             for root, dirs, files in os.walk(dir):
                 for name in files:
                     path = os.path.join(root, name)
-                    entries.append((path, None))
+                    if os.path.isfile(path):
+                        entries.append((path, None))
 
         for i in files_and_dirs:

View File

@@ -166,6 +166,7 @@ def setup_completion(fatal_errors):
         from libprs500.ebooks.mobi.reader import option_parser as mobioeb
         from libprs500.web.feeds.main import option_parser as feeds2disk
         from libprs500.web.feeds.recipes import titles as feed_titles
+        from libprs500.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
 
         f = open_file('/etc/bash_completion.d/libprs500')
@@ -191,6 +192,7 @@ def setup_completion(fatal_errors):
         f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
         f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
         f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
+        f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
         f.write('''
 _prs500_ls()
 {

View File

@@ -79,7 +79,7 @@ class Feed(object):
         self.oldest_article = oldest_article
 
         for item in entries:
-            if len(self.articles) > max_articles_per_feed:
+            if len(self.articles) >= max_articles_per_feed:
                 break
             self.parse_article(item)

View File

@@ -41,7 +41,7 @@ Available builtin recipes are:
     p.remove_option('--base-dir')
     p.remove_option('--verbose')
     p.remove_option('--max-files')
-    p.subsume('WEB2DISK OPTIONS', 'Options to control web2disk (used to fetch websites linked from feeds)')
+    p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)'))
 
     p.add_option('--feeds', default=None,
                  help=_('''Specify a list of feeds to download. For example:
@@ -50,7 +50,7 @@ If you specify this option, any argument to %prog is ignored and a default recip
     p.add_option('--verbose', default=False, action='store_true',
                  help=_('''Be more verbose while processing.'''))
     p.add_option('--title', default=None,
-                 help='The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.')
+                 help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
     p.add_option('--username', default=None, help=_('Username for sites that require a login to access content.'))
     p.add_option('--password', default=None, help=_('Password for sites that require a login to access content.'))
     p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
@@ -61,7 +61,9 @@ If you specify this option, any argument to %prog is ignored and a default recip
     p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false',
                  help=_('Dont show the progress bar'))
     p.add_option('--debug', action='store_true', default=False,
-                 help='Very verbose output, useful for debugging.')
+                 help=_('Very verbose output, useful for debugging.'))
+    p.add_option('--test', action='store_true', default=False,
+                 help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))
 
     return p
@@ -72,10 +74,10 @@ def simple_progress_bar(percent, msg):
 def no_progress_bar(percent, msg):
     print msg
 
-def main(args=sys.argv, notification=None, handler=None):
-    p = option_parser()
-    opts, args = p.parse_args(args)
+class RecipeError(Exception):
+    pass
 
+def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
     if notification is None:
         from libprs500.terminfo import TerminalController, ProgressBar
         term = TerminalController(sys.stdout)
@@ -89,18 +91,15 @@ def main(args=sys.argv, notification=None, handler=None):
     else:
         notification = no_progress_bar
 
-    if len(args) != 2 and opts.feeds is None:
-        p.print_help()
-        return 1
-
     recipe = None
     if opts.feeds is not None:
         recipe = BasicNewsRecipe
     else:
         try:
-            if os.access(args[1], os.R_OK):
+            if os.access(recipe_arg, os.R_OK):
                 try:
-                    recipe = compile_recipe(open(args[1]).read())
+                    recipe = compile_recipe(open(recipe_arg).read())
                 except:
                     import traceback
                     traceback.print_exc()
@@ -108,15 +107,13 @@ def main(args=sys.argv, notification=None, handler=None):
             else:
                 raise Exception('not file')
         except:
-            recipe = get_builtin_recipe(args[1])
+            recipe = get_builtin_recipe(recipe_arg)
             if recipe is None:
-                recipe = compile_recipe(args[1])
+                recipe = compile_recipe(recipe_arg)
 
     if recipe is None:
-        p.print_help()
-        print
-        print args[1], 'is an invalid recipe'
-        return 1
+        raise RecipeError(recipe_arg+ ' is an invalid recipe')
 
     if handler is None:
         from libprs500 import ColoredFormatter
@@ -125,9 +122,23 @@ def main(args=sys.argv, notification=None, handler=None):
         handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
         logging.getLogger('feeds2disk').addHandler(handler)
 
-    recipe = recipe(opts, p, notification)
+    recipe = recipe(opts, parser, notification)
+    if not os.path.exists(recipe.output_dir):
+        os.makedirs(recipe.output_dir)
     recipe.download()
+
+    return recipe
+
+def main(args=sys.argv, notification=None, handler=None):
+    p = option_parser()
+    opts, args = p.parse_args(args)
+
+    if len(args) != 2 and opts.feeds is None:
+        p.print_help()
+        return 1
+    recipe_arg = args[1] if len(args) > 1 else None
+
+    run_recipe(opts, recipe_arg, p, notification=notification, handler=handler)
 
     return 0
 
 if __name__ == '__main__':
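
The refactor splits the old main() in two: run_recipe() resolves the recipe, downloads it and returns the recipe object, raising RecipeError instead of printing help and returning an exit code, while main() remains a thin command line wrapper. That split is what lets feeds2lrf reuse the download step. A sketch of calling it from another program, mirroring how main() does it; the recipe title is illustrative:

    # Sketch: reusing run_recipe() the way feeds2lrf does.
    from libprs500.web.feeds.main import option_parser, run_recipe

    parser = option_parser()
    # main() passes sys.argv through unchanged, so argv[0] is the program name.
    opts, args = parser.parse_args(['feeds2disk', 'Newsweek'])
    recipe = run_recipe(opts, args[1], parser)
    print recipe.output_dir   # index.html and the downloaded feeds end up here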

View File

@@ -165,6 +165,8 @@ class BasicNewsRecipe(object):
         '''
         if not self.feeds:
             raise NotImplementedError
+        if self.test:
+            return self.feeds[:2]
         return self.feeds
 
     @classmethod
@@ -225,9 +227,12 @@ class BasicNewsRecipe(object):
         @param parser: Command line option parser. Used to intelligently merge options.
         @param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
         '''
-        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug'):
+        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'):
             setattr(self, attr, getattr(options, attr))
         self.output_dir = os.path.abspath(self.output_dir)
+        if options.test:
+            self.max_articles_per_feed = 2
+            self.simultaneous_downloads = min(4, self.simultaneous_downloads)
 
         self.logger = logging.getLogger('feeds2disk')
@@ -288,11 +293,13 @@ class BasicNewsRecipe(object):
             self.simultaneous_downloads = 1
 
         self.navbar = templates.NavBarTemplate()
-        self.max_articles_per_feed -= 1
         self.html2lrf_options.append('--use-spine')
         self.failed_downloads = []
         self.partial_failures = []
 
     def _postprocess_html(self, soup):
         if self.extra_css is not None:
             head = soup.find('head')
@@ -383,6 +390,8 @@ class BasicNewsRecipe(object):
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
         feeds = self.parse_feeds()
+        if self.test:
+            feeds = feeds[:2]
         self.has_single_feed = len(feeds) == 1
 
         index = os.path.join(self.output_dir, 'index.html')
@@ -460,13 +469,14 @@ class BasicNewsRecipe(object):
         if dir is None:
             dir = self.output_dir
 
         mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
+        mi.author_sort = __appname__
         opf_path = os.path.join(dir, 'index.opf')
         ncx_path = os.path.join(dir, 'index.ncx')
         opf = OPFCreator(dir, mi)
 
-        manifest = ['feed_%d'%i for i in range(len(feeds))]
-        manifest.append('index.html')
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
 
         cpath = getattr(self, 'cover_path', None)
         if cpath is not None and os.access(cpath, os.R_OK):
             opf.cover = cpath
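
The new test attribute is where the --test switch from feeds2disk lands: get_feeds() and build_index() trim the feed list to two, and max_articles_per_feed is forced to 2, so recipe development runs stay small and fast. A minimal recipe sketch showing where the flag takes effect; the class name, feed URLs, and the import path for BasicNewsRecipe are assumptions for illustration:

    # Sketch: a custom recipe exercised in test mode.
    from libprs500.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'
        feeds = ['http://example.com/feed1.xml',
                 'http://example.com/feed2.xml',
                 'http://example.com/feed3.xml']

    # Run with:  feeds2disk --test /path/to/this_recipe.py
    # self.test is then True, so only the first two feeds and at most
    # two articles per feed are downloaded.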

View File

@@ -41,7 +41,7 @@ class Newsweek(BasicNewsRecipe):
              'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
             ]
 
-    extra_css = '#content { font:serif 1.2em; }'
+    extra_css = '#content { font:serif 12pt; }\n.story {font:12pt}\n.HorizontalHeader {font:18pt}\n.deck {font:16pt}'
 
     keep_only_tags = [dict(name='div', id='content')]
     remove_tags = [
@@ -54,11 +54,6 @@ class Newsweek(BasicNewsRecipe):
     recursions = 1
     match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
 
-    # For testing
-    #feeds = feeds[3:5]
-    #max_articles_per_feed = 2
-
     def postprocess_html(self, soup):
         divs = list(soup.findAll('div', 'pagination'))

View File

@@ -37,9 +37,11 @@ def basename(url):
     return res
 
 def save_soup(soup, target):
-    nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    meta = soup.find('meta', content=True)
-    if meta and 'charset' in meta['content']:
-        meta.replaceWith(nm)
+    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    nm = ns.find('meta')
+    metas = soup.findAll('meta', content=True)
+    for meta in metas:
+        if 'charset' in meta['content']:
+            meta.replaceWith(nm)
     f = codecs.open(target, 'w', 'utf-8')
     f.write(unicode(soup))
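
save_soup() previously rewrote only the first meta tag that carried a charset; it now replaces every such tag, so the file it writes out as UTF-8 no longer carries a stale encoding declaration. Roughly the behaviour, in standalone form; the BeautifulSoup import path is assumed to be the copy bundled with libprs500:

    # Sketch: what the new save_soup() does to charset declarations.
    from libprs500.ebooks.BeautifulSoup import BeautifulSoup

    html = '<head><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" /></head><body><p>x</p></body>'
    soup = BeautifulSoup(html)
    utf8 = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />').find('meta')
    for meta in soup.findAll('meta', content=True):
        if 'charset' in meta['content']:
            meta.replaceWith(utf8)
    print unicode(soup)   # the only charset declared now is UTF-8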