feeds2lrf

2025-07-07 18:24:30 -04:00 · 2008-03-15 01:43:20 +00:00 · 2008-03-15 01:43:20 +00:00 · 00b0cf46fc
commit 00b0cf46fc
parent fb53f18a8f
11 changed files with 206 additions and 70 deletions
--- a/src/libprs500/__init__.py
+++ b/src/libprs500/__init__.py
@ -18,7 +18,7 @@ __docformat__ = "epytext"
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__   = 'libprs500'

-import sys, os, logging, mechanize, locale, cStringIO, re, subprocess, textwrap
+import sys, os, logging, mechanize, locale, copy, cStringIO, re, subprocess, textwrap
 from gettext import GNUTranslations
 from math import floor
 from optparse import OptionParser as _OptionParser
@ -143,38 +143,64 @@ class OptionParser(_OptionParser):
            raise Exception(msg)
        _OptionParser.error(self, msg)
        
+    def merge(self, parser):
+        '''
+        Add options from parser to self. In case of conflicts, conflicting options from
+        parser are skipped.
+
+        Top level options of C{parser} are added directly to self. Every option
+        group of C{parser} is recreated on self (by title only) and filled with
+        its non conflicting options. Only deep copies of the options are added,
+        so C{parser} itself is left unmodified.
+
+        @param parser: The parser whose options should be added to self.
+        '''
+        # Imported locally so that the module level imports stay untouched.
+        from optparse import OptionConflictError
+
+        def add_missing(options, container):
+            for opt in copy.deepcopy(options):
+                if self.has_option(opt.get_opt_string()):
+                    continue
+                try:
+                    container.add_option(opt)
+                except OptionConflictError:
+                    # get_opt_string() only reports the primary option string.
+                    # Another alias of the option (typically its short form)
+                    # can still clash with an option self already has. Such
+                    # an option is a conflict as well and is skipped.
+                    continue
+
+        add_missing(list(parser.option_list), self)
+
+        for group in list(parser.option_groups):
+            g = self.add_option_group(group.title)
+            add_missing(group.option_list, g)
+        
    def subsume(self, group_name, msg=''):
        '''
        Move all existing options into a subgroup named
        C{group_name} with description C{msg}.
        '''
-        opts   = list(self.option_list)
-        groups = list(self.option_groups)
-        exclude = []
-        
-        for opt in opts:
-            ops = opt.get_opt_string()
-            if ops in ('--help', '--version'):
-                exclude.append(opt)
-            else:
-                self.remove_option(ops)
-        for group in groups:
-            for opt in group.option_list:
-                opts.append(opt) 
-                group.remove_option(opt)
-        
+        opts = [opt for opt in self.options_iter() if opt.get_opt_string() not in ('--version', '--help')]
        self.option_groups = []
        subgroup = self.add_option_group(group_name, msg)
        for opt in opts:
-            if opt in exclude:
-                continue
+            self.remove_option(opt.get_opt_string())
            subgroup.add_option(opt)
        
-            
-        
-        
-            
-            
+    def options_iter(self):
+        '''
+        Iterate over all options of this parser: first the options added
+        directly to the parser, then the options of every option group, in
+        the order in which the groups are stored. Options whose string
+        representation is blank are skipped.
+        '''
+        def non_blank(options):
+            for candidate in options:
+                if str(candidate).strip():
+                    yield candidate
+
+        for opt in non_blank(self.option_list):
+            yield opt
+        for group in self.option_groups:
+            for opt in non_blank(group.option_list):
+                yield opt
+                
+    def option_by_dest(self, dest):
+        '''
+        Return the first option yielded by C{self.options_iter()} whose
+        C{dest} attribute equals C{dest}, or C{None} if no option matches.
+        '''
+        found = None
+        for candidate in self.options_iter():
+            if candidate.dest == dest:
+                found = candidate
+                break
+        return found
+    
+    def merge_options(self, lower, upper):
+        '''
+        Merge options in lower and upper option lists into upper.
+        Default values in upper are overridden by
+        non default values in lower.
+
+        Only destinations present in both C{lower} and C{upper} are considered.
+        Destinations for which this parser has no option (for example values
+        that were set directly on an options object) have no known default
+        and are left untouched.
+
+        @param lower: Options object holding the lower priority values.
+        @param upper: Options object holding the higher priority values.
+                      It is modified in place.
+        '''
+        for dest, value in lower.__dict__.items():
+            if dest not in upper.__dict__:
+                continue
+            opt = self.option_by_dest(dest)
+            if opt is None:
+                # No option declares this destination, so there is no
+                # default to compare against.
+                continue
+            if value != opt.default and upper.__dict__[dest] == opt.default:
+                upper.__dict__[dest] = value
        

 def load_library(name, cdll):
--- a/src/libprs500/ebooks/lrf/feeds/__init__.py
+++ b/src/libprs500/ebooks/lrf/feeds/__init__.py
@ -0,0 +1,16 @@
+#!/usr/bin/env  python
+
+##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
--- a/src/libprs500/ebooks/lrf/feeds/convert_from.py
+++ b/src/libprs500/ebooks/lrf/feeds/convert_from.py
@ -0,0 +1,71 @@
+#!/usr/bin/env  python
+##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Convert web feeds to LRF files.
+'''
+from libprs500.ebooks.lrf import option_parser as lrf_option_parser
+from libprs500.ebooks.lrf.html.convert_from import process_file
+from libprs500.web.feeds.main import option_parser as feeds_option_parser
+from libprs500.web.feeds.main import run_recipe
+from libprs500.ptempfile import PersistentTemporaryDirectory
+from libprs500 import sanitize_file_name
+
+import sys, os, time
+
+def option_parser():
+    '''
+    Build the command line parser for feeds2lrf.
+
+    Starts from the feeds2disk parser without its C{--output-dir} and
+    C{--lrf} options, moves the remaining options into a
+    'FEEDS2DISK OPTIONS' group and then merges in the html2lrf options,
+    themselves gathered in an 'HTML2LRF OPTIONS' group.
+    '''
+    feeds_parser = feeds_option_parser()
+    for unwanted in ('--output-dir', '--lrf'):
+        feeds_parser.remove_option(unwanted)
+    feeds_parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk'))
+
+    html2lrf_parser = lrf_option_parser('')
+    html2lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf'))
+
+    feeds_parser.merge(html2lrf_parser)
+    return feeds_parser
+
+def main(args=sys.argv, notification=None, handler=None):
+    '''
+    Download a recipe (or the feeds given via C{--feeds}) into a temporary
+    directory and convert the resulting C{index.html} with C{process_file}.
+
+    @param args: Command line arguments, including the program name.
+    @param notification: Passed through unchanged to C{run_recipe}.
+    @param handler: Passed through unchanged to C{run_recipe}.
+    @return: 0 on success, 1 if the usage message was printed instead.
+    @raise RuntimeError: If no readable C{index.html} was produced.
+    '''
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    opts.lrf = True
+
+    if len(args) != 2 and opts.feeds is None:
+        parser.print_help()
+        return 1
+
+    recipe_arg = args[1] if len(args) > 1 else None
+
+    tdir            = PersistentTemporaryDirectory('_feeds2lrf')
+    opts.output_dir = tdir
+
+    recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler)
+
+    htmlfile = os.path.join(tdir, 'index.html')
+    if not os.access(htmlfile, os.R_OK):
+        # recipe_arg is None when the feeds were supplied via --feeds; name
+        # the feeds instead, so that building the message cannot itself fail.
+        what = recipe_arg if recipe_arg is not None else str(opts.feeds)
+        raise RuntimeError(_('Fetching of recipe failed: ')+what)
+
+    # Options set by the recipe take effect only where the user left the
+    # corresponding command line option at its default.
+    lparser = lrf_option_parser('')
+    ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0]
+    parser.merge_options(ropts, opts)
+
+    if not opts.output:
+        ext = '.lrs' if opts.lrs else '.lrf'
+        fname = recipe.title + time.strftime(recipe.timefmt)+ext
+        opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname))
+    print 'Generating LRF...'
+    process_file(htmlfile, opts)
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@ -1715,7 +1715,7 @@ def process_file(path, options, logger=None):
    
    tpath = '' 
    try_opf(path, options, logger)
-    if options.cover:
+    if getattr(options, 'cover', None):
        options.cover = os.path.expanduser(options.cover)            
        if not os.path.isabs(options.cover):
            options.cover = os.path.join(dirpath, options.cover)
@ -1750,7 +1750,7 @@ def process_file(path, options, logger=None):
        options.title = default_title
    
    for prop in ('author', 'author_sort', 'title', 'title_sort', 'publisher', 'freetext'):
-        val = getattr(options, prop)
+        val = getattr(options, prop, None)
        if val and not isinstance(val, unicode):
            soup = BeautifulSoup(val)
            setattr(options, prop, unicode(soup))
@ -1822,13 +1822,14 @@ def try_opf(path, options, logger):
                break
    if opf is None:
        return
+    
    dirpath = os.path.dirname(os.path.abspath(opf))
    opf = OPFReader(open(opf, 'rb'), dirpath)    
    try:
        title = opf.title        
-        if title and not options.title:
+        if title and not getattr(options, 'title', None):
            options.title = title
-        if options.author == 'Unknown':
+        if getattr(options, 'author', 'Unknown') == 'Unknown':
            if opf.authors:
                options.author = ', '.join(opf.authors)
            if opf.author_sort:
@ -1837,12 +1838,12 @@ def try_opf(path, options, logger):
            publisher = opf.publisher
            if publisher:
                options.publisher = publisher
-        if not options.category:
+        if not getattr(options, 'category', None):
            category = opf.category
            if category:
                options.category = category
-        if not options.cover or options.use_metadata_cover:
-            orig_cover = options.cover
+        if not getattr(options, 'cover', None) or options.use_metadata_cover:
+            orig_cover = getattr(options, 'cover', None)
            options.cover = None
            cover = opf.cover            
            if cover:
@ -1865,10 +1866,10 @@ def try_opf(path, options, logger):
                                    break
                                except:
                                    continue
-            if not options.cover and orig_cover is not None:
+            if not getattr(options, 'cover', None) and orig_cover is not None:
                options.cover = orig_cover        
        options.spine = [i.href for i in opf.spine.items()]
-        if not hasattr(options, 'toc') or options.toc is None:
+        if not getattr(options, 'toc', None):
            options.toc   = opf.toc
    except Exception:
        logger.exception('Failed to process opf file')
--- a/src/libprs500/ebooks/metadata/opf.py
+++ b/src/libprs500/ebooks/metadata/opf.py
@ -503,7 +503,7 @@ class OPFReader(OPF):
            stream.close()
        self.manifest = Manifest(self.soup, dir)
        self.spine = Spine(self.soup, self.manifest)
-        self.toc = TOC()
+        self.toc = TOC(base_path=dir)
        self.toc.read_from_opf(self)
        self.cover_data = (None, None)
        
@ -554,13 +554,15 @@ class OPFCreator(MetaInformation):
        self.manifest = rentries
        
    def create_manifest_from_files_in(self, files_and_dirs):
+        #self.base_path = os.path.commonprefix(files_and_dirs)
        entries = []
        
        def dodir(dir):
            for root, dirs, files in os.walk(dir):
                for name in files:
                    path = os.path.join(root, name)
-                    entries.append((path, None)) 
+                    if os.path.isfile(path):
+                        entries.append((path, None)) 
        
        for i in files_and_dirs:
            if os.path.isdir(i):
--- a/src/libprs500/linux.py
+++ b/src/libprs500/linux.py
@ -166,6 +166,7 @@ def setup_completion(fatal_errors):
        from libprs500.ebooks.mobi.reader import option_parser as mobioeb
        from libprs500.web.feeds.main import option_parser as feeds2disk
        from libprs500.web.feeds.recipes import titles as feed_titles
+        from libprs500.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
        
        f = open_file('/etc/bash_completion.d/libprs500')
        
@ -191,6 +192,7 @@ def setup_completion(fatal_errors):
        f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
        f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
        f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
+        f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
        f.write('''
 _prs500_ls()
 {
--- a/src/libprs500/web/feeds/__init__.py
+++ b/src/libprs500/web/feeds/__init__.py
@ -79,7 +79,7 @@ class Feed(object):
        self.oldest_article = oldest_article
        
        for item in entries:
-            if len(self.articles) > max_articles_per_feed:
+            if len(self.articles) >= max_articles_per_feed:
                break
            self.parse_article(item)

--- a/src/libprs500/web/feeds/main.py
+++ b/src/libprs500/web/feeds/main.py
@ -41,7 +41,7 @@ Available builtin recipes are:
    p.remove_option('--base-dir')
    p.remove_option('--verbose')
    p.remove_option('--max-files')
-    p.subsume('WEB2DISK OPTIONS', 'Options to control web2disk (used to fetch websites linked from feeds)')
+    p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)'))
    
    p.add_option('--feeds', default=None,
                 help=_('''Specify a list of feeds to download. For example: 
@ -50,7 +50,7 @@ If you specify this option, any argument to %prog is ignored and a default recip
    p.add_option('--verbose', default=False, action='store_true',
                 help=_('''Be more verbose while processing.'''))
    p.add_option('--title', default=None,
-                 help='The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.')
+                 help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
    p.add_option('--username', default=None, help=_('Username for sites that require a login to access content.'))
    p.add_option('--password', default=None, help=_('Password for sites that require a login to access content.'))
    p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
@ -61,7 +61,9 @@ If you specify this option, any argument to %prog is ignored and a default recip
    p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false',
                 help=_('Dont show the progress bar'))
    p.add_option('--debug', action='store_true', default=False,
-                 help='Very verbose output, useful for debugging.')
+                 help=_('Very verbose output, useful for debugging.'))
+    p.add_option('--test', action='store_true', default=False, 
+                 help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))
    
    return p
    
@ -72,10 +74,10 @@ def simple_progress_bar(percent, msg):
 def no_progress_bar(percent, msg):
    print msg

-def main(args=sys.argv, notification=None, handler=None):
-    p = option_parser()
-    opts, args = p.parse_args(args)
-    
+class RecipeError(Exception):
+    '''
+    Raised by C{run_recipe} when the recipe argument cannot be resolved
+    to a recipe.
+    '''
+    pass
+
+def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
    if notification is None:
        from libprs500.terminfo import TerminalController, ProgressBar
        term = TerminalController(sys.stdout)
@ -89,18 +91,15 @@ def main(args=sys.argv, notification=None, handler=None):
        else:
            notification = no_progress_bar
        
-    if len(args) != 2 and opts.feeds is None:
-        p.print_help()
-        return 1
    
    recipe = None
    if opts.feeds is not None:
        recipe = BasicNewsRecipe
    else:
        try:
-            if os.access(args[1], os.R_OK):
+            if os.access(recipe_arg, os.R_OK):
                try:
-                    recipe = compile_recipe(open(args[1]).read())
+                    recipe = compile_recipe(open(recipe_arg).read())
                except:
                    import traceback
                    traceback.print_exc()
@ -108,15 +107,13 @@ def main(args=sys.argv, notification=None, handler=None):
            else:
                raise Exception('not file')
        except:
-            recipe = get_builtin_recipe(args[1])
+            recipe = get_builtin_recipe(recipe_arg)
            if recipe is None:
-                recipe = compile_recipe(args[1])
+                recipe = compile_recipe(recipe_arg)
    
    if recipe is None:
-        p.print_help()
-        print
-        print args[1], 'is an invalid recipe'
-        return 1
+        raise RecipeError(recipe_arg+ ' is an invalid recipe')
+        
    
    if handler is None:
        from libprs500 import ColoredFormatter
@ -125,9 +122,23 @@ def main(args=sys.argv, notification=None, handler=None):
        handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
        logging.getLogger('feeds2disk').addHandler(handler)
    
-    recipe = recipe(opts, p, notification)
+    recipe = recipe(opts, parser, notification)
+    if not os.path.exists(recipe.output_dir):
+        os.makedirs(recipe.output_dir)
    recipe.download()
-        
+    
+    return recipe
+
+def main(args=sys.argv, notification=None, handler=None):
+    '''
+    Command line entry point: parse C{args} and run the selected recipe.
+
+    @param args: Command line arguments, including the program name.
+    @param notification: Passed through unchanged to C{run_recipe}.
+    @param handler: Passed through unchanged to C{run_recipe}.
+    @return: 0 on success, 1 if the usage message was printed instead.
+    '''
+    p = option_parser()
+    opts, args = p.parse_args(args)
+    
+    # A recipe argument is only optional when --feeds was given.
+    if len(args) != 2 and opts.feeds is None:
+        p.print_help()
+        return 1
+    recipe_arg = args[1] if len(args) > 1 else None
+    # NOTE(review): run_recipe raises RecipeError for an invalid recipe and
+    # nothing here catches it -- confirm that propagating it is intended.
+    run_recipe(opts, recipe_arg, p, notification=notification, handler=handler)    
+            
    return 0

 if __name__ == '__main__':
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@ -165,6 +165,8 @@ class BasicNewsRecipe(object):
        '''
        if not self.feeds:
            raise NotImplementedError
+        if self.test:
+            return self.feeds[:2]
        return self.feeds
    
    @classmethod
@ -225,10 +227,13 @@ class BasicNewsRecipe(object):
        @param parser:  Command line option parser. Used to intelligently merge options.
        @param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
        '''
-        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug'):
+        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'):
            setattr(self, attr, getattr(options, attr))
        self.output_dir = os.path.abspath(self.output_dir)
-        
+        if options.test:
+            self.max_articles_per_feed = 2
+            self.simultaneous_downloads = min(4, self.simultaneous_downloads)
+            
        self.logger = logging.getLogger('feeds2disk')
        
        if self.debug:
@ -288,10 +293,12 @@ class BasicNewsRecipe(object):
            self.simultaneous_downloads = 1
            
        self.navbar = templates.NavBarTemplate()
-        self.max_articles_per_feed -= 1
        self.html2lrf_options.append('--use-spine')
        self.failed_downloads = []
        self.partial_failures = []
+        
+        
+                
            
    def _postprocess_html(self, soup):
        if self.extra_css is not None:
@ -383,6 +390,8 @@ class BasicNewsRecipe(object):
    def build_index(self):
        self.report_progress(0, _('Fetching feeds...'))
        feeds = self.parse_feeds()
+        if self.test:
+            feeds = feeds[:2]
        self.has_single_feed = len(feeds) == 1
        
        index = os.path.join(self.output_dir, 'index.html') 
@ -460,13 +469,14 @@ class BasicNewsRecipe(object):
        if dir is None:
            dir = self.output_dir
        mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
+        mi.author_sort = __appname__
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')
        opf = OPFCreator(dir, mi)
        
        
-        manifest = ['feed_%d'%i for i in range(len(feeds))]
-        manifest.append('index.html')
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
        cpath = getattr(self, 'cover_path', None) 
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
--- a/src/libprs500/web/feeds/recipes/newsweek.py
+++ b/src/libprs500/web/feeds/recipes/newsweek.py
@ -41,7 +41,7 @@ class Newsweek(BasicNewsRecipe):
             'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
             ]
    
-    extra_css = '#content { font:serif 1.2em; }'
+    extra_css = '#content { font:serif 12pt; }\n.story {font:12pt}\n.HorizontalHeader {font:18pt}\n.deck {font:16pt}'
    keep_only_tags = [dict(name='div', id='content')]

    remove_tags = [
@ -54,11 +54,6 @@ class Newsweek(BasicNewsRecipe):
    recursions = 1
    match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
    
-    # For testing
-    #feeds = feeds[3:5]
-    #max_articles_per_feed = 2
-    
-    
    
    def postprocess_html(self,  soup):
        divs = list(soup.findAll('div', 'pagination'))
--- a/src/libprs500/web/fetch/simple.py
+++ b/src/libprs500/web/fetch/simple.py
@ -37,10 +37,12 @@ def basename(url):
    return res

 def save_soup(soup, target):
-    nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    meta = soup.find('meta', content=True)
-    if meta and 'charset' in meta['content']:
-        meta.replaceWith(nm)
+    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    nm = ns.find('meta')
+    metas = soup.findAll('meta', content=True)
+    for meta in metas:
+        if 'charset' in meta['content']:
+            meta.replaceWith(nm)
    f = codecs.open(target, 'w', 'utf-8')
    f.write(unicode(soup))
    f.close()