From 00b0cf46fc298d1ed95dd1774e6dbbbc947f85b7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 15 Mar 2008 01:43:20 +0000 Subject: [PATCH] feeds2lrf --- src/libprs500/__init__.py | 72 +++++++++++++------ src/libprs500/ebooks/lrf/feeds/__init__.py | 16 +++++ .../ebooks/lrf/feeds/convert_from.py | 71 ++++++++++++++++++ src/libprs500/ebooks/lrf/html/convert_from.py | 19 ++--- src/libprs500/ebooks/metadata/opf.py | 6 +- src/libprs500/linux.py | 2 + src/libprs500/web/feeds/__init__.py | 2 +- src/libprs500/web/feeds/main.py | 51 +++++++------ src/libprs500/web/feeds/news.py | 20 ++++-- src/libprs500/web/feeds/recipes/newsweek.py | 7 +- src/libprs500/web/fetch/simple.py | 10 +-- 11 files changed, 206 insertions(+), 70 deletions(-) create mode 100644 src/libprs500/ebooks/lrf/feeds/__init__.py create mode 100644 src/libprs500/ebooks/lrf/feeds/convert_from.py diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py index 0374dc3adc..f1547982d4 100644 --- a/src/libprs500/__init__.py +++ b/src/libprs500/__init__.py @@ -18,7 +18,7 @@ __docformat__ = "epytext" __author__ = "Kovid Goyal " __appname__ = 'libprs500' -import sys, os, logging, mechanize, locale, cStringIO, re, subprocess, textwrap +import sys, os, logging, mechanize, locale, copy, cStringIO, re, subprocess, textwrap from gettext import GNUTranslations from math import floor from optparse import OptionParser as _OptionParser @@ -143,38 +143,64 @@ class OptionParser(_OptionParser): raise Exception(msg) _OptionParser.error(self, msg) + def merge(self, parser): + ''' + Add options from parser to self. In case of conflicts, confilicting options from + parser are skipped. + ''' + opts = list(parser.option_list) + groups = list(parser.option_groups) + + def merge_options(options, container): + for opt in copy.deepcopy(options): + if not self.has_option(opt.get_opt_string()): + container.add_option(opt) + + merge_options(opts, self) + + for group in groups: + g = self.add_option_group(group.title) + merge_options(group.option_list, g) + def subsume(self, group_name, msg=''): ''' Move all existing options into a subgroup named C{group_name} with description C{msg}. ''' - opts = list(self.option_list) - groups = list(self.option_groups) - exclude = [] - - for opt in opts: - ops = opt.get_opt_string() - if ops in ('--help', '--version'): - exclude.append(opt) - else: - self.remove_option(ops) - for group in groups: - for opt in group.option_list: - opts.append(opt) - group.remove_option(opt) - + opts = [opt for opt in self.options_iter() if opt.get_opt_string() not in ('--version', '--help')] self.option_groups = [] subgroup = self.add_option_group(group_name, msg) for opt in opts: - if opt in exclude: - continue + self.remove_option(opt.get_opt_string()) subgroup.add_option(opt) - - - - - + def options_iter(self): + for opt in self.option_list: + if str(opt).strip(): + yield opt + for gr in self.option_groups: + for opt in gr.option_list: + if str(opt).strip(): + yield opt + + def option_by_dest(self, dest): + for opt in self.options_iter(): + if opt.dest == dest: + return opt + + def merge_options(self, lower, upper): + ''' + Merge options in lower and upper option lists into upper. + Default values in upper are overriden by + non default values in lower. + ''' + for dest in lower.__dict__.keys(): + if not upper.__dict__.has_key(dest): + continue + opt = self.option_by_dest(dest) + if lower.__dict__[dest] != opt.default and \ + upper.__dict__[dest] == opt.default: + upper.__dict__[dest] = lower.__dict__[dest] def load_library(name, cdll): diff --git a/src/libprs500/ebooks/lrf/feeds/__init__.py b/src/libprs500/ebooks/lrf/feeds/__init__.py new file mode 100644 index 0000000000..86475d028f --- /dev/null +++ b/src/libprs500/ebooks/lrf/feeds/__init__.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python + +## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. diff --git a/src/libprs500/ebooks/lrf/feeds/convert_from.py b/src/libprs500/ebooks/lrf/feeds/convert_from.py new file mode 100644 index 0000000000..b55e20d3bc --- /dev/null +++ b/src/libprs500/ebooks/lrf/feeds/convert_from.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +''' +Convert web feeds to LRF files. +''' +from libprs500.ebooks.lrf import option_parser as lrf_option_parser +from libprs500.ebooks.lrf.html.convert_from import process_file +from libprs500.web.feeds.main import option_parser as feeds_option_parser +from libprs500.web.feeds.main import run_recipe +from libprs500.ptempfile import PersistentTemporaryDirectory +from libprs500 import sanitize_file_name + +import sys, os, time + +def option_parser(): + parser = feeds_option_parser() + parser.remove_option('--output-dir') + parser.remove_option('--lrf') + parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk')) + lrf_parser = lrf_option_parser('') + lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf')) + parser.merge(lrf_parser) + return parser + +def main(args=sys.argv, notification=None, handler=None): + parser = option_parser() + opts, args = parser.parse_args(args) + opts.lrf = True + + if len(args) != 2 and opts.feeds is None: + parser.print_help() + return 1 + + recipe_arg = args[1] if len(args) > 1 else None + + tdir = PersistentTemporaryDirectory('_feeds2lrf') + opts.output_dir = tdir + + recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler) + + htmlfile = os.path.join(tdir, 'index.html') + if not os.access(htmlfile, os.R_OK): + raise RuntimeError(_('Fetching of recipe failed: ')+recipe_arg) + + lparser = lrf_option_parser('') + ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0] + parser.merge_options(ropts, opts) + + if not opts.output: + ext = '.lrs' if opts.lrs else '.lrf' + fname = recipe.title + time.strftime(recipe.timefmt)+ext + opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname)) + print 'Generating LRF...' + process_file(htmlfile, opts) + return 0 + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index 6ec3f06c53..0e95ca0e81 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -1715,7 +1715,7 @@ def process_file(path, options, logger=None): tpath = '' try_opf(path, options, logger) - if options.cover: + if getattr(options, 'cover', None): options.cover = os.path.expanduser(options.cover) if not os.path.isabs(options.cover): options.cover = os.path.join(dirpath, options.cover) @@ -1750,7 +1750,7 @@ def process_file(path, options, logger=None): options.title = default_title for prop in ('author', 'author_sort', 'title', 'title_sort', 'publisher', 'freetext'): - val = getattr(options, prop) + val = getattr(options, prop, None) if val and not isinstance(val, unicode): soup = BeautifulSoup(val) setattr(options, prop, unicode(soup)) @@ -1822,13 +1822,14 @@ def try_opf(path, options, logger): break if opf is None: return + dirpath = os.path.dirname(os.path.abspath(opf)) opf = OPFReader(open(opf, 'rb'), dirpath) try: title = opf.title - if title and not options.title: + if title and not getattr(options, 'title', None): options.title = title - if options.author == 'Unknown': + if getattr(options, 'author', 'Unknown') == 'Unknown': if opf.authors: options.author = ', '.join(opf.authors) if opf.author_sort: @@ -1837,12 +1838,12 @@ def try_opf(path, options, logger): publisher = opf.publisher if publisher: options.publisher = publisher - if not options.category: + if not getattr(options, 'category', None): category = opf.category if category: options.category = category - if not options.cover or options.use_metadata_cover: - orig_cover = options.cover + if not getattr(options, 'cover', None) or options.use_metadata_cover: + orig_cover = getattr(options, 'cover', None) options.cover = None cover = opf.cover if cover: @@ -1865,10 +1866,10 @@ def try_opf(path, options, logger): break except: continue - if not options.cover and orig_cover is not None: + if not getattr(options, 'cover', None) and orig_cover is not None: options.cover = orig_cover options.spine = [i.href for i in opf.spine.items()] - if not hasattr(options, 'toc') or options.toc is None: + if not getattr(options, 'toc', None): options.toc = opf.toc except Exception: logger.exception('Failed to process opf file') diff --git a/src/libprs500/ebooks/metadata/opf.py b/src/libprs500/ebooks/metadata/opf.py index c4b627cf71..73d88f7145 100644 --- a/src/libprs500/ebooks/metadata/opf.py +++ b/src/libprs500/ebooks/metadata/opf.py @@ -503,7 +503,7 @@ class OPFReader(OPF): stream.close() self.manifest = Manifest(self.soup, dir) self.spine = Spine(self.soup, self.manifest) - self.toc = TOC() + self.toc = TOC(base_path=dir) self.toc.read_from_opf(self) self.cover_data = (None, None) @@ -554,13 +554,15 @@ class OPFCreator(MetaInformation): self.manifest = rentries def create_manifest_from_files_in(self, files_and_dirs): + #self.base_path = os.path.commonprefix(files_and_dirs) entries = [] def dodir(dir): for root, dirs, files in os.walk(dir): for name in files: path = os.path.join(root, name) - entries.append((path, None)) + if os.path.isfile(path): + entries.append((path, None)) for i in files_and_dirs: if os.path.isdir(i): diff --git a/src/libprs500/linux.py b/src/libprs500/linux.py index de3846f4a5..d883434d6a 100644 --- a/src/libprs500/linux.py +++ b/src/libprs500/linux.py @@ -166,6 +166,7 @@ def setup_completion(fatal_errors): from libprs500.ebooks.mobi.reader import option_parser as mobioeb from libprs500.web.feeds.main import option_parser as feeds2disk from libprs500.web.feeds.recipes import titles as feed_titles + from libprs500.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf f = open_file('/etc/bash_completion.d/libprs500') @@ -191,6 +192,7 @@ def setup_completion(fatal_errors): f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf'])) f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc'])) f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles)) + f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles)) f.write(''' _prs500_ls() { diff --git a/src/libprs500/web/feeds/__init__.py b/src/libprs500/web/feeds/__init__.py index 1a3083131d..fb551febd4 100644 --- a/src/libprs500/web/feeds/__init__.py +++ b/src/libprs500/web/feeds/__init__.py @@ -79,7 +79,7 @@ class Feed(object): self.oldest_article = oldest_article for item in entries: - if len(self.articles) > max_articles_per_feed: + if len(self.articles) >= max_articles_per_feed: break self.parse_article(item) diff --git a/src/libprs500/web/feeds/main.py b/src/libprs500/web/feeds/main.py index 8e2d7d0701..9c34614e28 100644 --- a/src/libprs500/web/feeds/main.py +++ b/src/libprs500/web/feeds/main.py @@ -41,7 +41,7 @@ Available builtin recipes are: p.remove_option('--base-dir') p.remove_option('--verbose') p.remove_option('--max-files') - p.subsume('WEB2DISK OPTIONS', 'Options to control web2disk (used to fetch websites linked from feeds)') + p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)')) p.add_option('--feeds', default=None, help=_('''Specify a list of feeds to download. For example: @@ -50,7 +50,7 @@ If you specify this option, any argument to %prog is ignored and a default recip p.add_option('--verbose', default=False, action='store_true', help=_('''Be more verbose while processing.''')) p.add_option('--title', default=None, - help='The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.') + help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.')) p.add_option('--username', default=None, help=_('Username for sites that require a login to access content.')) p.add_option('--password', default=None, help=_('Password for sites that require a login to access content.')) p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.') @@ -61,7 +61,9 @@ If you specify this option, any argument to %prog is ignored and a default recip p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false', help=_('Dont show the progress bar')) p.add_option('--debug', action='store_true', default=False, - help='Very verbose output, useful for debugging.') + help=_('Very verbose output, useful for debugging.')) + p.add_option('--test', action='store_true', default=False, + help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.')) return p @@ -72,10 +74,10 @@ def simple_progress_bar(percent, msg): def no_progress_bar(percent, msg): print msg -def main(args=sys.argv, notification=None, handler=None): - p = option_parser() - opts, args = p.parse_args(args) - +class RecipeError(Exception): + pass + +def run_recipe(opts, recipe_arg, parser, notification=None, handler=None): if notification is None: from libprs500.terminfo import TerminalController, ProgressBar term = TerminalController(sys.stdout) @@ -89,18 +91,15 @@ def main(args=sys.argv, notification=None, handler=None): else: notification = no_progress_bar - if len(args) != 2 and opts.feeds is None: - p.print_help() - return 1 recipe = None if opts.feeds is not None: recipe = BasicNewsRecipe else: try: - if os.access(args[1], os.R_OK): + if os.access(recipe_arg, os.R_OK): try: - recipe = compile_recipe(open(args[1]).read()) + recipe = compile_recipe(open(recipe_arg).read()) except: import traceback traceback.print_exc() @@ -108,15 +107,13 @@ def main(args=sys.argv, notification=None, handler=None): else: raise Exception('not file') except: - recipe = get_builtin_recipe(args[1]) + recipe = get_builtin_recipe(recipe_arg) if recipe is None: - recipe = compile_recipe(args[1]) + recipe = compile_recipe(recipe_arg) if recipe is None: - p.print_help() - print - print args[1], 'is an invalid recipe' - return 1 + raise RecipeError(recipe_arg+ ' is an invalid recipe') + if handler is None: from libprs500 import ColoredFormatter @@ -125,9 +122,23 @@ def main(args=sys.argv, notification=None, handler=None): handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar logging.getLogger('feeds2disk').addHandler(handler) - recipe = recipe(opts, p, notification) + recipe = recipe(opts, parser, notification) + if not os.path.exists(recipe.output_dir): + os.makedirs(recipe.output_dir) recipe.download() - + + return recipe + +def main(args=sys.argv, notification=None, handler=None): + p = option_parser() + opts, args = p.parse_args(args) + + if len(args) != 2 and opts.feeds is None: + p.print_help() + return 1 + recipe_arg = args[1] if len(args) > 1 else None + run_recipe(opts, recipe_arg, p, notification=notification, handler=handler) + return 0 if __name__ == '__main__': diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py index 98e2405c72..4550e34fcc 100644 --- a/src/libprs500/web/feeds/news.py +++ b/src/libprs500/web/feeds/news.py @@ -165,6 +165,8 @@ class BasicNewsRecipe(object): ''' if not self.feeds: raise NotImplementedError + if self.test: + return self.feeds[:2] return self.feeds @classmethod @@ -225,10 +227,13 @@ class BasicNewsRecipe(object): @param parser: Command line option parser. Used to intelligently merge options. @param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional. ''' - for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug'): + for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'): setattr(self, attr, getattr(options, attr)) self.output_dir = os.path.abspath(self.output_dir) - + if options.test: + self.max_articles_per_feed = 2 + self.simultaneous_downloads = min(4, self.simultaneous_downloads) + self.logger = logging.getLogger('feeds2disk') if self.debug: @@ -288,10 +293,12 @@ class BasicNewsRecipe(object): self.simultaneous_downloads = 1 self.navbar = templates.NavBarTemplate() - self.max_articles_per_feed -= 1 self.html2lrf_options.append('--use-spine') self.failed_downloads = [] self.partial_failures = [] + + + def _postprocess_html(self, soup): if self.extra_css is not None: @@ -383,6 +390,8 @@ class BasicNewsRecipe(object): def build_index(self): self.report_progress(0, _('Fetching feeds...')) feeds = self.parse_feeds() + if self.test: + feeds = feeds[:2] self.has_single_feed = len(feeds) == 1 index = os.path.join(self.output_dir, 'index.html') @@ -460,13 +469,14 @@ class BasicNewsRecipe(object): if dir is None: dir = self.output_dir mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__]) + mi.author_sort = __appname__ opf_path = os.path.join(dir, 'index.opf') ncx_path = os.path.join(dir, 'index.ncx') opf = OPFCreator(dir, mi) - manifest = ['feed_%d'%i for i in range(len(feeds))] - manifest.append('index.html') + manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] + manifest.append(os.path.join(dir, 'index.html')) cpath = getattr(self, 'cover_path', None) if cpath is not None and os.access(cpath, os.R_OK): opf.cover = cpath diff --git a/src/libprs500/web/feeds/recipes/newsweek.py b/src/libprs500/web/feeds/recipes/newsweek.py index 0313e52f33..8772e79325 100644 --- a/src/libprs500/web/feeds/recipes/newsweek.py +++ b/src/libprs500/web/feeds/recipes/newsweek.py @@ -41,7 +41,7 @@ class Newsweek(BasicNewsRecipe): 'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen', ] - extra_css = '#content { font:serif 1.2em; }' + extra_css = '#content { font:serif 12pt; }\n.story {font:12pt}\n.HorizontalHeader {font:18pt}\n.deck {font:16pt}' keep_only_tags = [dict(name='div', id='content')] remove_tags = [ @@ -54,11 +54,6 @@ class Newsweek(BasicNewsRecipe): recursions = 1 match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+'] - # For testing - #feeds = feeds[3:5] - #max_articles_per_feed = 2 - - def postprocess_html(self, soup): divs = list(soup.findAll('div', 'pagination')) diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py index b6622631e9..8d9b842a7c 100644 --- a/src/libprs500/web/fetch/simple.py +++ b/src/libprs500/web/fetch/simple.py @@ -37,10 +37,12 @@ def basename(url): return res def save_soup(soup, target): - nm = Tag(soup, '') - meta = soup.find('meta', content=True) - if meta and 'charset' in meta['content']: - meta.replaceWith(nm) + ns = BeautifulSoup('') + nm = ns.find('meta') + metas = soup.findAll('meta', content=True) + for meta in metas: + if 'charset' in meta['content']: + meta.replaceWith(nm) f = codecs.open(target, 'w', 'utf-8') f.write(unicode(soup)) f.close()