From 00b0cf46fc298d1ed95dd1774e6dbbbc947f85b7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 15 Mar 2008 01:43:20 +0000
Subject: [PATCH] feeds2lrf

---
 src/libprs500/__init__.py                     | 72 +++++++++++++------
 src/libprs500/ebooks/lrf/feeds/__init__.py    | 16 +++++
 .../ebooks/lrf/feeds/convert_from.py          | 71 ++++++++++++++++++
 src/libprs500/ebooks/lrf/html/convert_from.py | 19 ++---
 src/libprs500/ebooks/metadata/opf.py          |  6 +-
 src/libprs500/linux.py                        |  2 +
 src/libprs500/web/feeds/__init__.py           |  2 +-
 src/libprs500/web/feeds/main.py               | 51 +++++++------
 src/libprs500/web/feeds/news.py               | 20 ++++--
 src/libprs500/web/feeds/recipes/newsweek.py   |  7 +-
 src/libprs500/web/fetch/simple.py             | 10 +--
 11 files changed, 206 insertions(+), 70 deletions(-)
 create mode 100644 src/libprs500/ebooks/lrf/feeds/__init__.py
 create mode 100644 src/libprs500/ebooks/lrf/feeds/convert_from.py

diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py
index 0374dc3adc..f1547982d4 100644
--- a/src/libprs500/__init__.py
+++ b/src/libprs500/__init__.py
@@ -18,7 +18,7 @@ __docformat__ = "epytext"
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__   = 'libprs500'
 
-import sys, os, logging, mechanize, locale, cStringIO, re, subprocess, textwrap
+import sys, os, logging, mechanize, locale, copy, cStringIO, re, subprocess, textwrap
 from gettext import GNUTranslations
 from math import floor
 from optparse import OptionParser as _OptionParser
@@ -143,38 +143,64 @@ class OptionParser(_OptionParser):
             raise Exception(msg)
         _OptionParser.error(self, msg)
         
+    def merge(self, parser):
+        '''
+        Add options from C{parser} to self. In case of conflicts, the conflicting
+        options from C{parser} are skipped. Option groups of C{parser} are
+        re-created on self with their title and description.
+        '''
+        def merge_options(options, container):
+            # Work on deep copies so the options of parser are left untouched
+            for opt in copy.deepcopy(options):
+                if not self.has_option(opt.get_opt_string()):
+                    container.add_option(opt)
+
+        merge_options(list(parser.option_list), self)
+
+        for group in list(parser.option_groups):
+            # Pass the description too; add_option_group(title) alone drops it
+            g = self.add_option_group(group.title, group.description)
+            merge_options(group.option_list, g)
+        
     def subsume(self, group_name, msg=''):
         '''
         Move all existing options into a subgroup named
         C{group_name} with description C{msg}.
         '''
-        opts   = list(self.option_list)
-        groups = list(self.option_groups)
-        exclude = []
-        
-        for opt in opts:
-            ops = opt.get_opt_string()
-            if ops in ('--help', '--version'):
-                exclude.append(opt)
-            else:
-                self.remove_option(ops)
-        for group in groups:
-            for opt in group.option_list:
-                opts.append(opt) 
-                group.remove_option(opt)
-        
+        opts = [opt for opt in self.options_iter() if opt.get_opt_string() not in ('--version', '--help')]
         self.option_groups = []
         subgroup = self.add_option_group(group_name, msg)
         for opt in opts:
-            if opt in exclude:
-                continue
+            self.remove_option(opt.get_opt_string())
             subgroup.add_option(opt)
         
-            
-        
-        
-            
-            
+    def options_iter(self):
+        '''Yield the options of self, then those of each of its option groups.'''
+        containers = [self] + list(self.option_groups)
+        for container in containers:
+            for opt in container.option_list:
+                # Options whose string form is blank are skipped
+                if str(opt).strip():
+                    yield opt
+                
+    def option_by_dest(self, dest):
+        '''Return the first option whose dest is C{dest}, or None if there is none.'''
+        found = [opt for opt in self.options_iter() if opt.dest == dest]
+        return found[0] if found else None
+    
+    def merge_options(self, lower, upper):
+        '''
+        Merge options in lower and upper option lists into upper.
+        Default values in upper are overridden by non default values in
+        lower. Dests unknown to upper or to this parser are skipped.
+        '''
+        for dest, val in lower.__dict__.items():
+            opt = self.option_by_dest(dest)
+            if opt is None or dest not in upper.__dict__:
+                continue
+            # Only fill in values that are still at their default in upper
+            if val != opt.default and upper.__dict__[dest] == opt.default:
+                upper.__dict__[dest] = val
         
 
 def load_library(name, cdll):
diff --git a/src/libprs500/ebooks/lrf/feeds/__init__.py b/src/libprs500/ebooks/lrf/feeds/__init__.py
new file mode 100644
index 0000000000..86475d028f
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/feeds/__init__.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env  python
+
+##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
diff --git a/src/libprs500/ebooks/lrf/feeds/convert_from.py b/src/libprs500/ebooks/lrf/feeds/convert_from.py
new file mode 100644
index 0000000000..b55e20d3bc
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/feeds/convert_from.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env  python
+##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Convert web feeds to LRF files.
+'''
+from libprs500.ebooks.lrf import option_parser as lrf_option_parser
+from libprs500.ebooks.lrf.html.convert_from import process_file
+from libprs500.web.feeds.main import option_parser as feeds_option_parser
+from libprs500.web.feeds.main import run_recipe
+from libprs500.ptempfile import PersistentTemporaryDirectory
+from libprs500 import sanitize_file_name
+
+import sys, os, time
+
+def option_parser():
+    '''Build the feeds2lrf parser: feeds2disk options merged with html2lrf ones.'''
+    parser, lrf_parser = feeds_option_parser(), lrf_option_parser('')
+    for flag in ('--output-dir', '--lrf'):
+        parser.remove_option(flag)
+    parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk'))
+    lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf'))
+    parser.merge(lrf_parser)
+    return parser
+
+def main(args=sys.argv, notification=None, handler=None):
+    '''
+    Download a recipe (or the feeds given via C{--feeds}) into a temporary
+    directory and convert the downloaded index.html with C{process_file}.
+    C{notification} and C{handler} are passed on unchanged to C{run_recipe}.
+    @return: 0 on success, 1 if help was printed because no recipe was named.
+    '''
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    opts.lrf = True
+    if len(args) != 2 and opts.feeds is None:
+        parser.print_help()
+        return 1
+    recipe_arg = args[1] if len(args) > 1 else None
+    tdir = opts.output_dir = PersistentTemporaryDirectory('_feeds2lrf')
+    recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler)
+    htmlfile = os.path.join(tdir, 'index.html')
+    if not os.access(htmlfile, os.R_OK):
+        # recipe_arg is None when only --feeds was given, so name the feeds instead
+        raise RuntimeError(_('Fetching of recipe failed: ')+(recipe_arg or str(opts.feeds)))
+
+    # Merge the html2lrf options requested by the recipe into opts
+    ropts = lrf_option_parser('').parse_args(['html2lrf']+recipe.html2lrf_options)[0]
+    parser.merge_options(ropts, opts)
+    if not opts.output:
+        ext = '.lrs' if opts.lrs else '.lrf'
+        fname = recipe.title + time.strftime(recipe.timefmt)+ext
+        opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname))
+    print 'Generating LRF...'
+    process_file(htmlfile, opts)
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py
index 6ec3f06c53..0e95ca0e81 100644
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -1715,7 +1715,7 @@ def process_file(path, options, logger=None):
     
     tpath = '' 
     try_opf(path, options, logger)
-    if options.cover:
+    if getattr(options, 'cover', None):
         options.cover = os.path.expanduser(options.cover)            
         if not os.path.isabs(options.cover):
             options.cover = os.path.join(dirpath, options.cover)
@@ -1750,7 +1750,7 @@ def process_file(path, options, logger=None):
         options.title = default_title
     
     for prop in ('author', 'author_sort', 'title', 'title_sort', 'publisher', 'freetext'):
-        val = getattr(options, prop)
+        val = getattr(options, prop, None)
         if val and not isinstance(val, unicode):
             soup = BeautifulSoup(val)
             setattr(options, prop, unicode(soup))
@@ -1822,13 +1822,14 @@ def try_opf(path, options, logger):
                 break
     if opf is None:
         return
+    
     dirpath = os.path.dirname(os.path.abspath(opf))
     opf = OPFReader(open(opf, 'rb'), dirpath)    
     try:
         title = opf.title        
-        if title and not options.title:
+        if title and not getattr(options, 'title', None):
             options.title = title
-        if options.author == 'Unknown':
+        if getattr(options, 'author', 'Unknown') == 'Unknown':
             if opf.authors:
                 options.author = ', '.join(opf.authors)
             if opf.author_sort:
@@ -1837,12 +1838,12 @@ def try_opf(path, options, logger):
             publisher = opf.publisher
             if publisher:
                 options.publisher = publisher
-        if not options.category:
+        if not getattr(options, 'category', None):
             category = opf.category
             if category:
                 options.category = category
-        if not options.cover or options.use_metadata_cover:
-            orig_cover = options.cover
+        if not getattr(options, 'cover', None) or options.use_metadata_cover:
+            orig_cover = getattr(options, 'cover', None)
             options.cover = None
             cover = opf.cover            
             if cover:
@@ -1865,10 +1866,10 @@ def try_opf(path, options, logger):
                                     break
                                 except:
                                     continue
-            if not options.cover and orig_cover is not None:
+            if not getattr(options, 'cover', None) and orig_cover is not None:
                 options.cover = orig_cover        
         options.spine = [i.href for i in opf.spine.items()]
-        if not hasattr(options, 'toc') or options.toc is None:
+        if not getattr(options, 'toc', None):
             options.toc   = opf.toc
     except Exception:
         logger.exception('Failed to process opf file')
diff --git a/src/libprs500/ebooks/metadata/opf.py b/src/libprs500/ebooks/metadata/opf.py
index c4b627cf71..73d88f7145 100644
--- a/src/libprs500/ebooks/metadata/opf.py
+++ b/src/libprs500/ebooks/metadata/opf.py
@@ -503,7 +503,7 @@ class OPFReader(OPF):
             stream.close()
         self.manifest = Manifest(self.soup, dir)
         self.spine = Spine(self.soup, self.manifest)
-        self.toc = TOC()
+        self.toc = TOC(base_path=dir)
         self.toc.read_from_opf(self)
         self.cover_data = (None, None)
         
@@ -554,13 +554,15 @@ class OPFCreator(MetaInformation):
         self.manifest = rentries
         
     def create_manifest_from_files_in(self, files_and_dirs):
+        #self.base_path = os.path.commonprefix(files_and_dirs)
         entries = []
         
         def dodir(dir):
             for root, dirs, files in os.walk(dir):
                 for name in files:
                     path = os.path.join(root, name)
-                    entries.append((path, None)) 
+                    if os.path.isfile(path):
+                        entries.append((path, None)) 
         
         for i in files_and_dirs:
             if os.path.isdir(i):
diff --git a/src/libprs500/linux.py b/src/libprs500/linux.py
index de3846f4a5..d883434d6a 100644
--- a/src/libprs500/linux.py
+++ b/src/libprs500/linux.py
@@ -166,6 +166,7 @@ def setup_completion(fatal_errors):
         from libprs500.ebooks.mobi.reader import option_parser as mobioeb
         from libprs500.web.feeds.main import option_parser as feeds2disk
         from libprs500.web.feeds.recipes import titles as feed_titles
+        from libprs500.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
         
         f = open_file('/etc/bash_completion.d/libprs500')
         
@@ -191,6 +192,7 @@ def setup_completion(fatal_errors):
         f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
         f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
         f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
+        f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
         f.write('''
 _prs500_ls()
 {
diff --git a/src/libprs500/web/feeds/__init__.py b/src/libprs500/web/feeds/__init__.py
index 1a3083131d..fb551febd4 100644
--- a/src/libprs500/web/feeds/__init__.py
+++ b/src/libprs500/web/feeds/__init__.py
@@ -79,7 +79,7 @@ class Feed(object):
         self.oldest_article = oldest_article
         
         for item in entries:
-            if len(self.articles) > max_articles_per_feed:
+            if len(self.articles) >= max_articles_per_feed:
                 break
             self.parse_article(item)
 
diff --git a/src/libprs500/web/feeds/main.py b/src/libprs500/web/feeds/main.py
index 8e2d7d0701..9c34614e28 100644
--- a/src/libprs500/web/feeds/main.py
+++ b/src/libprs500/web/feeds/main.py
@@ -41,7 +41,7 @@ Available builtin recipes are:
     p.remove_option('--base-dir')
     p.remove_option('--verbose')
     p.remove_option('--max-files')
-    p.subsume('WEB2DISK OPTIONS', 'Options to control web2disk (used to fetch websites linked from feeds)')
+    p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)'))
     
     p.add_option('--feeds', default=None,
                  help=_('''Specify a list of feeds to download. For example: 
@@ -50,7 +50,7 @@ If you specify this option, any argument to %prog is ignored and a default recip
     p.add_option('--verbose', default=False, action='store_true',
                  help=_('''Be more verbose while processing.'''))
     p.add_option('--title', default=None,
-                 help='The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.')
+                 help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
     p.add_option('--username', default=None, help=_('Username for sites that require a login to access content.'))
     p.add_option('--password', default=None, help=_('Password for sites that require a login to access content.'))
     p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
@@ -61,7 +61,9 @@ If you specify this option, any argument to %prog is ignored and a default recip
     p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false',
                  help=_('Dont show the progress bar'))
     p.add_option('--debug', action='store_true', default=False,
-                 help='Very verbose output, useful for debugging.')
+                 help=_('Very verbose output, useful for debugging.'))
+    p.add_option('--test', action='store_true', default=False, 
+                 help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))
     
     return p
     
@@ -72,10 +74,10 @@ def simple_progress_bar(percent, msg):
 def no_progress_bar(percent, msg):
     print msg
 
-def main(args=sys.argv, notification=None, handler=None):
-    p = option_parser()
-    opts, args = p.parse_args(args)
-    
+class RecipeError(Exception):
+    '''Raised by run_recipe when the recipe argument does not resolve to a recipe.'''
+
+def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
     if notification is None:
         from libprs500.terminfo import TerminalController, ProgressBar
         term = TerminalController(sys.stdout)
@@ -89,18 +91,15 @@ def main(args=sys.argv, notification=None, handler=None):
         else:
             notification = no_progress_bar
         
-    if len(args) != 2 and opts.feeds is None:
-        p.print_help()
-        return 1
     
     recipe = None
     if opts.feeds is not None:
         recipe = BasicNewsRecipe
     else:
         try:
-            if os.access(args[1], os.R_OK):
+            if os.access(recipe_arg, os.R_OK):
                 try:
-                    recipe = compile_recipe(open(args[1]).read())
+                    recipe = compile_recipe(open(recipe_arg).read())
                 except:
                     import traceback
                     traceback.print_exc()
@@ -108,15 +107,13 @@ def main(args=sys.argv, notification=None, handler=None):
             else:
                 raise Exception('not file')
         except:
-            recipe = get_builtin_recipe(args[1])
+            recipe = get_builtin_recipe(recipe_arg)
             if recipe is None:
-                recipe = compile_recipe(args[1])
+                recipe = compile_recipe(recipe_arg)
     
     if recipe is None:
-        p.print_help()
-        print
-        print args[1], 'is an invalid recipe'
-        return 1
+        raise RecipeError(recipe_arg+ ' is an invalid recipe')
+        
     
     if handler is None:
         from libprs500 import ColoredFormatter
@@ -125,9 +122,23 @@ def main(args=sys.argv, notification=None, handler=None):
         handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
         logging.getLogger('feeds2disk').addHandler(handler)
     
-    recipe = recipe(opts, p, notification)
+    recipe = recipe(opts, parser, notification)
+    if not os.path.exists(recipe.output_dir):
+        os.makedirs(recipe.output_dir)
     recipe.download()
-        
+    
+    return recipe
+
+def main(args=sys.argv, notification=None, handler=None):
+    p = option_parser()
+    opts, args = p.parse_args(args)
+    
+    if len(args) != 2 and opts.feeds is None:
+        p.print_help()
+        return 1
+    recipe_arg = args[1] if len(args) > 1 else None
+    run_recipe(opts, recipe_arg, p, notification=notification, handler=handler)    
+            
     return 0
 
 if __name__ == '__main__':
diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py
index 98e2405c72..4550e34fcc 100644
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@@ -165,6 +165,8 @@ class BasicNewsRecipe(object):
         '''
         if not self.feeds:
             raise NotImplementedError
+        if self.test:
+            return self.feeds[:2]
         return self.feeds
     
     @classmethod
@@ -225,10 +227,13 @@ class BasicNewsRecipe(object):
         @param parser:  Command line option parser. Used to intelligently merge options.
         @param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
         '''
-        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug'):
+        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'):
             setattr(self, attr, getattr(options, attr))
         self.output_dir = os.path.abspath(self.output_dir)
-        
+        if options.test:
+            self.max_articles_per_feed = 2
+            self.simultaneous_downloads = min(4, self.simultaneous_downloads)
+            
         self.logger = logging.getLogger('feeds2disk')
         
         if self.debug:
@@ -288,10 +293,12 @@ class BasicNewsRecipe(object):
             self.simultaneous_downloads = 1
             
         self.navbar = templates.NavBarTemplate()
-        self.max_articles_per_feed -= 1
         self.html2lrf_options.append('--use-spine')
         self.failed_downloads = []
         self.partial_failures = []
+        
+        
+                
             
     def _postprocess_html(self, soup):
         if self.extra_css is not None:
@@ -383,6 +390,8 @@ class BasicNewsRecipe(object):
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
         feeds = self.parse_feeds()
+        if self.test:
+            feeds = feeds[:2]
         self.has_single_feed = len(feeds) == 1
         
         index = os.path.join(self.output_dir, 'index.html') 
@@ -460,13 +469,14 @@ class BasicNewsRecipe(object):
         if dir is None:
             dir = self.output_dir
         mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
+        mi.author_sort = __appname__
         opf_path = os.path.join(dir, 'index.opf')
         ncx_path = os.path.join(dir, 'index.ncx')
         opf = OPFCreator(dir, mi)
         
         
-        manifest = ['feed_%d'%i for i in range(len(feeds))]
-        manifest.append('index.html')
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
         cpath = getattr(self, 'cover_path', None) 
         if cpath is not None and os.access(cpath, os.R_OK):
             opf.cover = cpath
diff --git a/src/libprs500/web/feeds/recipes/newsweek.py b/src/libprs500/web/feeds/recipes/newsweek.py
index 0313e52f33..8772e79325 100644
--- a/src/libprs500/web/feeds/recipes/newsweek.py
+++ b/src/libprs500/web/feeds/recipes/newsweek.py
@@ -41,7 +41,7 @@ class Newsweek(BasicNewsRecipe):
              'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
              ]
     
-    extra_css = '#content { font:serif 1.2em; }'
+    extra_css = '#content { font:serif 12pt; }\n.story {font:12pt}\n.HorizontalHeader {font:18pt}\n.deck {font:16pt}'
     keep_only_tags = [dict(name='div', id='content')]
 
     remove_tags = [
@@ -54,11 +54,6 @@ class Newsweek(BasicNewsRecipe):
     recursions = 1
     match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
     
-    # For testing
-    #feeds = feeds[3:5]
-    #max_articles_per_feed = 2
-    
-    
     
     def postprocess_html(self,  soup):
         divs = list(soup.findAll('div', 'pagination'))
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py
index b6622631e9..8d9b842a7c 100644
--- a/src/libprs500/web/fetch/simple.py
+++ b/src/libprs500/web/fetch/simple.py
@@ -37,10 +37,12 @@ def basename(url):
     return res
 
 def save_soup(soup, target):
-    nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    meta = soup.find('meta', content=True)
-    if meta and 'charset' in meta['content']:
-        meta.replaceWith(nm)
+    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    nm = ns.find('meta')
+    metas = soup.findAll('meta', content=True)
+    for meta in metas:
+        if 'charset' in meta['content']:
+            meta.replaceWith(nm)
     f = codecs.open(target, 'w', 'utf-8')
     f.write(unicode(soup))
     f.close()