feeds2lrf

Kovid Goyal 2008-03-15 01:43:20 +00:00
parent fb53f18a8f
commit 00b0cf46fc
11 changed files with 206 additions and 70 deletions

View File

@@ -18,7 +18,7 @@ __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'
-import sys, os, logging, mechanize, locale, cStringIO, re, subprocess, textwrap
+import sys, os, logging, mechanize, locale, copy, cStringIO, re, subprocess, textwrap
 from gettext import GNUTranslations
 from math import floor
 from optparse import OptionParser as _OptionParser
@@ -143,38 +143,64 @@ class OptionParser(_OptionParser):
             raise Exception(msg)
         _OptionParser.error(self, msg)
 
+    def merge(self, parser):
+        '''
+        Add options from parser to self. In case of conflicts, conflicting options from
+        parser are skipped.
+        '''
+        opts = list(parser.option_list)
+        groups = list(parser.option_groups)
+
+        def merge_options(options, container):
+            for opt in copy.deepcopy(options):
+                if not self.has_option(opt.get_opt_string()):
+                    container.add_option(opt)
+
+        merge_options(opts, self)
+        for group in groups:
+            g = self.add_option_group(group.title)
+            merge_options(group.option_list, g)
+
     def subsume(self, group_name, msg=''):
         '''
         Move all existing options into a subgroup named
         C{group_name} with description C{msg}.
         '''
-        opts = list(self.option_list)
-        groups = list(self.option_groups)
-        exclude = []
-        for opt in opts:
-            ops = opt.get_opt_string()
-            if ops in ('--help', '--version'):
-                exclude.append(opt)
-            else:
-                self.remove_option(ops)
-        for group in groups:
-            for opt in group.option_list:
-                opts.append(opt)
-                group.remove_option(opt)
+        opts = [opt for opt in self.options_iter() if opt.get_opt_string() not in ('--version', '--help')]
         self.option_groups = []
         subgroup = self.add_option_group(group_name, msg)
         for opt in opts:
-            if opt in exclude:
-                continue
+            self.remove_option(opt.get_opt_string())
             subgroup.add_option(opt)
+
+    def options_iter(self):
+        for opt in self.option_list:
+            if str(opt).strip():
+                yield opt
+        for gr in self.option_groups:
+            for opt in gr.option_list:
+                if str(opt).strip():
+                    yield opt
+
+    def option_by_dest(self, dest):
+        for opt in self.options_iter():
+            if opt.dest == dest:
+                return opt
+
+    def merge_options(self, lower, upper):
+        '''
+        Merge options in lower and upper option lists into upper.
+        Default values in upper are overridden by
+        non-default values in lower.
+        '''
+        for dest in lower.__dict__.keys():
+            if not upper.__dict__.has_key(dest):
+                continue
+            opt = self.option_by_dest(dest)
+            if lower.__dict__[dest] != opt.default and \
+               upper.__dict__[dest] == opt.default:
+                upper.__dict__[dest] = lower.__dict__[dest]
+
 def load_library(name, cdll):
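
The merge()/subsume()/merge_options() additions above are what let two command line tools share one parser: subsume() folds a parser's existing options into a named group, merge() copies another parser's options and groups in (skipping conflicts), and merge_options() copies non-default values from one parsed result into another. A rough usage sketch, assuming OptionParser is imported from the libprs500 package as elsewhere in this commit; the option names here are only illustrative:

    # Sketch: composing two OptionParsers the way feeds2lrf (below) does.
    from libprs500 import OptionParser

    feeds = OptionParser(usage='%prog [options] recipe')
    feeds.add_option('--username', default=None)
    feeds.subsume('FEEDS2DISK OPTIONS')   # move existing options into a named group

    lrf = OptionParser(usage='%prog [options] file.html')
    lrf.add_option('--output', default=None)
    lrf.subsume('HTML2LRF OPTIONS')

    feeds.merge(lrf)                      # copy lrf's groups/options, skipping conflicts
    opts, args = feeds.parse_args([])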

View File

@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

View File

@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Convert web feeds to LRF files.
+'''
+
+from libprs500.ebooks.lrf import option_parser as lrf_option_parser
+from libprs500.ebooks.lrf.html.convert_from import process_file
+from libprs500.web.feeds.main import option_parser as feeds_option_parser
+from libprs500.web.feeds.main import run_recipe
+from libprs500.ptempfile import PersistentTemporaryDirectory
+from libprs500 import sanitize_file_name
+
+import sys, os, time
+
+def option_parser():
+    parser = feeds_option_parser()
+    parser.remove_option('--output-dir')
+    parser.remove_option('--lrf')
+    parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk'))
+    lrf_parser = lrf_option_parser('')
+    lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf'))
+    parser.merge(lrf_parser)
+    return parser
+
+def main(args=sys.argv, notification=None, handler=None):
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    opts.lrf = True
+    if len(args) != 2 and opts.feeds is None:
+        parser.print_help()
+        return 1
+    recipe_arg = args[1] if len(args) > 1 else None
+
+    tdir = PersistentTemporaryDirectory('_feeds2lrf')
+    opts.output_dir = tdir
+
+    recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler)
+
+    htmlfile = os.path.join(tdir, 'index.html')
+    if not os.access(htmlfile, os.R_OK):
+        raise RuntimeError(_('Fetching of recipe failed: ')+recipe_arg)
+
+    lparser = lrf_option_parser('')
+    ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0]
+    parser.merge_options(ropts, opts)
+
+    if not opts.output:
+        ext = '.lrs' if opts.lrs else '.lrf'
+        fname = recipe.title + time.strftime(recipe.timefmt)+ext
+        opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname))
+
+    print 'Generating LRF...'
+    process_file(htmlfile, opts)
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
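
This new entry point simply chains the two existing tools: run_recipe() (the feeds2disk machinery) downloads the recipe into a PersistentTemporaryDirectory, and html2lrf's process_file() then converts the generated index.html into an LRF, with recipe.html2lrf_options merged into the command line options. A hedged sketch of driving it from Python; 'Newsweek' is only an example of a builtin recipe title:

    # Sketch: invoking feeds2lrf programmatically instead of from the shell.
    import sys
    from libprs500.ebooks.lrf.feeds.convert_from import main as feeds2lrf_main

    # main() expects sys.argv-style arguments: argv[0] is the program name,
    # argv[1] a builtin recipe title or the path to a recipe file.
    sys.exit(feeds2lrf_main(['feeds2lrf', 'Newsweek']))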

View File

@@ -1715,7 +1715,7 @@ def process_file(path, options, logger=None):
         tpath = ''
     try_opf(path, options, logger)
-    if options.cover:
+    if getattr(options, 'cover', None):
         options.cover = os.path.expanduser(options.cover)
         if not os.path.isabs(options.cover):
             options.cover = os.path.join(dirpath, options.cover)
@@ -1750,7 +1750,7 @@ def process_file(path, options, logger=None):
             options.title = default_title
     for prop in ('author', 'author_sort', 'title', 'title_sort', 'publisher', 'freetext'):
-        val = getattr(options, prop)
+        val = getattr(options, prop, None)
         if val and not isinstance(val, unicode):
             soup = BeautifulSoup(val)
             setattr(options, prop, unicode(soup))
@@ -1822,13 +1822,14 @@ def try_opf(path, options, logger):
                 break
     if opf is None:
         return
+
     dirpath = os.path.dirname(os.path.abspath(opf))
     opf = OPFReader(open(opf, 'rb'), dirpath)
     try:
         title = opf.title
-        if title and not options.title:
+        if title and not getattr(options, 'title', None):
             options.title = title
-        if options.author == 'Unknown':
+        if getattr(options, 'author', 'Unknown') == 'Unknown':
             if opf.authors:
                 options.author = ', '.join(opf.authors)
             if opf.author_sort:
@@ -1837,12 +1838,12 @@ def try_opf(path, options, logger):
         publisher = opf.publisher
         if publisher:
             options.publisher = publisher
-        if not options.category:
+        if not getattr(options, 'category', None):
             category = opf.category
             if category:
                 options.category = category
-        if not options.cover or options.use_metadata_cover:
-            orig_cover = options.cover
+        if not getattr(options, 'cover', None) or options.use_metadata_cover:
+            orig_cover = getattr(options, 'cover', None)
             options.cover = None
             cover = opf.cover
             if cover:
@@ -1865,10 +1866,10 @@ def try_opf(path, options, logger):
                         break
                 except:
                     continue
-        if not options.cover and orig_cover is not None:
+        if not getattr(options, 'cover', None) and orig_cover is not None:
             options.cover = orig_cover
         options.spine = [i.href for i in opf.spine.items()]
-        if not hasattr(options, 'toc') or options.toc is None:
+        if not getattr(options, 'toc', None):
             options.toc = opf.toc
     except Exception:
         logger.exception('Failed to process opf file')
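
All of these hunks follow one pattern: process_file() and try_opf() now read optional attributes with getattr(options, name, default) instead of direct attribute access, because the options object may now come from the merged feeds2lrf parser and need not define every html2lrf-specific attribute. A minimal illustration of the difference; Opts is just a stand-in for the optparse values object:

    # Illustration: why getattr with a default is used above.
    class Opts(object):
        title = 'Example'

    options = Opts()
    print getattr(options, 'title', None)   # -> 'Example'
    print getattr(options, 'cover', None)   # -> None, no AttributeError
    # print options.cover                   # would raise AttributeError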

View File

@@ -503,7 +503,7 @@ class OPFReader(OPF):
             stream.close()
         self.manifest = Manifest(self.soup, dir)
         self.spine = Spine(self.soup, self.manifest)
-        self.toc = TOC()
+        self.toc = TOC(base_path=dir)
         self.toc.read_from_opf(self)
         self.cover_data = (None, None)
@@ -554,12 +554,14 @@ class OPFCreator(MetaInformation):
         self.manifest = rentries
 
     def create_manifest_from_files_in(self, files_and_dirs):
+        #self.base_path = os.path.commonprefix(files_and_dirs)
         entries = []
 
        def dodir(dir):
             for root, dirs, files in os.walk(dir):
                 for name in files:
                     path = os.path.join(root, name)
-                    entries.append((path, None))
+                    if os.path.isfile(path):
+                        entries.append((path, None))
 
         for i in files_and_dirs:

View File

@@ -166,6 +166,7 @@ def setup_completion(fatal_errors):
         from libprs500.ebooks.mobi.reader import option_parser as mobioeb
         from libprs500.web.feeds.main import option_parser as feeds2disk
         from libprs500.web.feeds.recipes import titles as feed_titles
+        from libprs500.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
 
         f = open_file('/etc/bash_completion.d/libprs500')
@@ -191,6 +192,7 @@ def setup_completion(fatal_errors):
         f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
         f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
         f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
+        f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
         f.write('''
 _prs500_ls()
 {

View File

@@ -79,7 +79,7 @@ class Feed(object):
         self.oldest_article = oldest_article
 
         for item in entries:
-            if len(self.articles) > max_articles_per_feed:
+            if len(self.articles) >= max_articles_per_feed:
                 break
             self.parse_article(item)

View File

@@ -41,7 +41,7 @@ Available builtin recipes are:
     p.remove_option('--base-dir')
     p.remove_option('--verbose')
     p.remove_option('--max-files')
-    p.subsume('WEB2DISK OPTIONS', 'Options to control web2disk (used to fetch websites linked from feeds)')
+    p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)'))
 
     p.add_option('--feeds', default=None,
                  help=_('''Specify a list of feeds to download. For example:
@@ -50,7 +50,7 @@ If you specify this option, any argument to %prog is ignored and a default recip
     p.add_option('--verbose', default=False, action='store_true',
                  help=_('''Be more verbose while processing.'''))
     p.add_option('--title', default=None,
-                 help='The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.')
+                 help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
     p.add_option('--username', default=None, help=_('Username for sites that require a login to access content.'))
     p.add_option('--password', default=None, help=_('Password for sites that require a login to access content.'))
     p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
@@ -61,7 +61,9 @@ If you specify this option, any argument to %prog is ignored and a default recip
     p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false',
                  help=_('Dont show the progress bar'))
     p.add_option('--debug', action='store_true', default=False,
-                 help='Very verbose output, useful for debugging.')
+                 help=_('Very verbose output, useful for debugging.'))
+    p.add_option('--test', action='store_true', default=False,
+                 help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))
 
     return p
@@ -72,10 +74,10 @@ def simple_progress_bar(percent, msg):
 def no_progress_bar(percent, msg):
     print msg
 
-def main(args=sys.argv, notification=None, handler=None):
-    p = option_parser()
-    opts, args = p.parse_args(args)
+class RecipeError(Exception):
+    pass
 
+def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
     if notification is None:
         from libprs500.terminfo import TerminalController, ProgressBar
         term = TerminalController(sys.stdout)
@@ -89,18 +91,15 @@ def main(args=sys.argv, notification=None, handler=None):
     else:
         notification = no_progress_bar
 
-    if len(args) != 2 and opts.feeds is None:
-        p.print_help()
-        return 1
-
     recipe = None
     if opts.feeds is not None:
         recipe = BasicNewsRecipe
     else:
         try:
-            if os.access(args[1], os.R_OK):
+            if os.access(recipe_arg, os.R_OK):
                 try:
-                    recipe = compile_recipe(open(args[1]).read())
+                    recipe = compile_recipe(open(recipe_arg).read())
                 except:
                     import traceback
                     traceback.print_exc()
@@ -108,15 +107,13 @@ def main(args=sys.argv, notification=None, handler=None):
             else:
                 raise Exception('not file')
         except:
-            recipe = get_builtin_recipe(args[1])
+            recipe = get_builtin_recipe(recipe_arg)
             if recipe is None:
-                recipe = compile_recipe(args[1])
+                recipe = compile_recipe(recipe_arg)
 
     if recipe is None:
-        p.print_help()
-        print
-        print args[1], 'is an invalid recipe'
-        return 1
+        raise RecipeError(recipe_arg+ ' is an invalid recipe')
 
     if handler is None:
         from libprs500 import ColoredFormatter
@@ -125,9 +122,23 @@ def main(args=sys.argv, notification=None, handler=None):
         handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
         logging.getLogger('feeds2disk').addHandler(handler)
 
-    recipe = recipe(opts, p, notification)
+    recipe = recipe(opts, parser, notification)
+    if not os.path.exists(recipe.output_dir):
+        os.makedirs(recipe.output_dir)
     recipe.download()
+
+    return recipe
+
+def main(args=sys.argv, notification=None, handler=None):
+    p = option_parser()
+    opts, args = p.parse_args(args)
+
+    if len(args) != 2 and opts.feeds is None:
+        p.print_help()
+        return 1
+    recipe_arg = args[1] if len(args) > 1 else None
+
+    run_recipe(opts, recipe_arg, p, notification=notification, handler=handler)
 
     return 0
 
 if __name__ == '__main__':
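
The refactor splits the old main() in two: run_recipe() resolves the recipe, downloads it and returns the recipe object, raising RecipeError instead of printing help and returning an exit code, while main() remains a thin command line wrapper. That split is what lets feeds2lrf reuse the download step. A sketch of calling it from another program, mirroring how main() does it; the recipe title is illustrative:

    # Sketch: reusing run_recipe() the way feeds2lrf does.
    from libprs500.web.feeds.main import option_parser, run_recipe

    parser = option_parser()
    # main() passes sys.argv through unchanged, so argv[0] is the program name.
    opts, args = parser.parse_args(['feeds2disk', 'Newsweek'])
    recipe = run_recipe(opts, args[1], parser)
    print recipe.output_dir   # index.html and the downloaded feeds end up here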

View File

@@ -165,6 +165,8 @@ class BasicNewsRecipe(object):
         '''
         if not self.feeds:
             raise NotImplementedError
+        if self.test:
+            return self.feeds[:2]
         return self.feeds
 
     @classmethod
@@ -225,9 +227,12 @@ class BasicNewsRecipe(object):
         @param parser: Command line option parser. Used to intelligently merge options.
         @param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
         '''
-        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug'):
+        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'):
             setattr(self, attr, getattr(options, attr))
         self.output_dir = os.path.abspath(self.output_dir)
+        if options.test:
+            self.max_articles_per_feed = 2
+            self.simultaneous_downloads = min(4, self.simultaneous_downloads)
 
         self.logger = logging.getLogger('feeds2disk')
@@ -288,11 +293,13 @@ class BasicNewsRecipe(object):
             self.simultaneous_downloads = 1
 
         self.navbar = templates.NavBarTemplate()
-        self.max_articles_per_feed -= 1
         self.html2lrf_options.append('--use-spine')
         self.failed_downloads = []
         self.partial_failures = []
 
     def _postprocess_html(self, soup):
         if self.extra_css is not None:
             head = soup.find('head')
@@ -383,6 +390,8 @@ class BasicNewsRecipe(object):
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
         feeds = self.parse_feeds()
+        if self.test:
+            feeds = feeds[:2]
         self.has_single_feed = len(feeds) == 1
 
         index = os.path.join(self.output_dir, 'index.html')
@@ -460,13 +469,14 @@ class BasicNewsRecipe(object):
         if dir is None:
             dir = self.output_dir
 
         mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
+        mi.author_sort = __appname__
         opf_path = os.path.join(dir, 'index.opf')
         ncx_path = os.path.join(dir, 'index.ncx')
         opf = OPFCreator(dir, mi)
 
-        manifest = ['feed_%d'%i for i in range(len(feeds))]
-        manifest.append('index.html')
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
 
         cpath = getattr(self, 'cover_path', None)
         if cpath is not None and os.access(cpath, os.R_OK):
             opf.cover = cpath
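
The new test attribute is where the --test switch from feeds2disk lands: get_feeds() and build_index() trim the feed list to two, and max_articles_per_feed is forced to 2, so recipe development runs stay small and fast. A minimal recipe sketch showing where the flag takes effect; the class name, feed URLs, and the import path for BasicNewsRecipe are assumptions for illustration:

    # Sketch: a custom recipe exercised in test mode.
    from libprs500.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'
        feeds = ['http://example.com/feed1.xml',
                 'http://example.com/feed2.xml',
                 'http://example.com/feed3.xml']

    # Run with:  feeds2disk --test /path/to/this_recipe.py
    # self.test is then True, so only the first two feeds and at most
    # two articles per feed are downloaded.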

View File

@@ -41,7 +41,7 @@ class Newsweek(BasicNewsRecipe):
              'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
             ]
 
-    extra_css = '#content { font:serif 1.2em; }'
+    extra_css = '#content { font:serif 12pt; }\n.story {font:12pt}\n.HorizontalHeader {font:18pt}\n.deck {font:16pt}'
 
     keep_only_tags = [dict(name='div', id='content')]
     remove_tags = [
@@ -54,11 +54,6 @@ class Newsweek(BasicNewsRecipe):
     recursions = 1
     match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
 
-    # For testing
-    #feeds = feeds[3:5]
-    #max_articles_per_feed = 2
-
     def postprocess_html(self, soup):
         divs = list(soup.findAll('div', 'pagination'))

View File

@@ -37,9 +37,11 @@ def basename(url):
     return res
 
 def save_soup(soup, target):
-    nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    meta = soup.find('meta', content=True)
-    if meta and 'charset' in meta['content']:
-        meta.replaceWith(nm)
+    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    nm = ns.find('meta')
+    metas = soup.findAll('meta', content=True)
+    for meta in metas:
+        if 'charset' in meta['content']:
+            meta.replaceWith(nm)
     f = codecs.open(target, 'w', 'utf-8')
     f.write(unicode(soup))
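
save_soup() previously rewrote only the first meta tag that carried a charset; it now replaces every such tag, so the file it writes out as UTF-8 no longer carries a stale encoding declaration. Roughly the behaviour, in standalone form; the BeautifulSoup import path is assumed to be the copy bundled with libprs500:

    # Sketch: what the new save_soup() does to charset declarations.
    from libprs500.ebooks.BeautifulSoup import BeautifulSoup

    html = '<head><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" /></head><body><p>x</p></body>'
    soup = BeautifulSoup(html)
    utf8 = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />').find('meta')
    for meta in soup.findAll('meta', content=True):
        if 'charset' in meta['content']:
            meta.replaceWith(utf8)
    print unicode(soup)   # the only charset declared now is UTF-8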